diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,85322 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 6560, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 901.0000305175781, + "epoch": 0.0001524390243902439, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 4.573170731707317e-09, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1 + }, + { + "completion_length": 1130.8333740234375, + "epoch": 0.0003048780487804878, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 9.146341463414635e-09, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2 + }, + { + "completion_length": 901.6667175292969, + "epoch": 0.00045731707317073173, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 1.371951219512195e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3 + }, + { + "completion_length": 918.1666870117188, + "epoch": 0.0006097560975609756, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 1.829268292682927e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4 + }, + { + "completion_length": 1087.3333740234375, + "epoch": 0.0007621951219512195, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2.2865853658536585e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5 + }, + { + "completion_length": 1691.0001220703125, + "epoch": 0.0009146341463414635, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 2.74390243902439e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6 + }, + { + "completion_length": 1565.1667175292969, + "epoch": 0.0010670731707317074, + "grad_norm": 2.003875988357694, + "kl": 0.0, + "learning_rate": 3.201219512195122e-08, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 7 + }, + { + "completion_length": 1365.1666870117188, + "epoch": 0.0012195121951219512, + "grad_norm": 0.010555593872609102, + "kl": 0.00015544891357421875, + "learning_rate": 3.658536585365854e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 8 + }, + { + "completion_length": 741.6666870117188, + "epoch": 0.001371951219512195, + "grad_norm": 0.02579690236939944, + "kl": 0.00016880035400390625, + "learning_rate": 4.115853658536586e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 9 + }, + { + "completion_length": 1462.1666870117188, + "epoch": 0.001524390243902439, + "grad_norm": 0.0188649248567718, + "kl": 0.00018072128295898438, + "learning_rate": 4.573170731707317e-08, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 10 + }, + { + "completion_length": 1171.3333740234375, + "epoch": 0.001676829268292683, + "grad_norm": 0.01165979938325126, + "kl": 0.00014662742614746094, + "learning_rate": 5.0304878048780495e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 11 + }, + { + "completion_length": 655.0000305175781, + "epoch": 0.001829268292682927, + "grad_norm": 3.922286216007927, + "kl": 0.00011968612670898438, + "learning_rate": 5.48780487804878e-08, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 12 + }, + { + "completion_length": 1211.0000610351562, + "epoch": 0.0019817073170731708, + "grad_norm": 0.010021424068282783, + "kl": 0.000125885009765625, + "learning_rate": 5.9451219512195127e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 13 + }, + { + "completion_length": 906.5000305175781, + "epoch": 0.002134146341463415, + "grad_norm": 2.120364536932112, + "kl": 0.00020551681518554688, + "learning_rate": 6.402439024390244e-08, + "loss": 0.0, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 14 + }, + { + "completion_length": 973.8333435058594, + "epoch": 0.0022865853658536584, + "grad_norm": 1.854307993736899, + "kl": 0.00014781951904296875, + "learning_rate": 6.859756097560975e-08, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 15 + }, + { + "completion_length": 1248.3333740234375, + "epoch": 0.0024390243902439024, + "grad_norm": 0.013779825471303365, + "kl": 0.00016307830810546875, + "learning_rate": 7.317073170731708e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 16 + }, + { + "completion_length": 931.1666870117188, + "epoch": 0.0025914634146341465, + "grad_norm": 2.2617274008954107, + "kl": 0.00022220611572265625, + "learning_rate": 7.774390243902439e-08, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 17 + }, + { + "completion_length": 809.8333435058594, + "epoch": 0.00274390243902439, + "grad_norm": 0.017252683203387877, + "kl": 0.0001926422119140625, + "learning_rate": 8.231707317073171e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 18 + }, + { + "completion_length": 1187.3333435058594, + "epoch": 0.002896341463414634, + "grad_norm": 0.015395331689378112, + "kl": 0.00022745132446289062, + "learning_rate": 8.689024390243903e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 19 + }, + { + "completion_length": 1211.0000305175781, + "epoch": 0.003048780487804878, + "grad_norm": 0.008365026324972633, + "kl": 0.00016260147094726562, + "learning_rate": 9.146341463414634e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 20 + }, + { + "completion_length": 1065.6666717529297, + "epoch": 0.0032012195121951218, + "grad_norm": 0.0186331098936123, + "kl": 0.00014448165893554688, + "learning_rate": 9.603658536585367e-08, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 21 + }, + { + "completion_length": 1217.0000610351562, + "epoch": 0.003353658536585366, + "grad_norm": 0.009592113077264, + "kl": 0.0001678466796875, + "learning_rate": 1.0060975609756099e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 22 + }, + { + "completion_length": 725.5000305175781, + "epoch": 0.00350609756097561, + "grad_norm": 0.027195232683233445, + "kl": 0.0001926422119140625, + "learning_rate": 1.051829268292683e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 23 + }, + { + "completion_length": 1233.5000305175781, + "epoch": 0.003658536585365854, + "grad_norm": 0.019076973571722427, + "kl": 0.00019121170043945312, + "learning_rate": 1.097560975609756e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 24 + }, + { + "completion_length": 988.8333740234375, + "epoch": 0.0038109756097560975, + "grad_norm": 2.5818171585550482, + "kl": 0.00016880035400390625, + "learning_rate": 1.1432926829268293e-07, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 25 + }, + { + "completion_length": 1175.5000610351562, + "epoch": 0.0039634146341463415, + "grad_norm": 0.01347163495277656, + "kl": 0.00014591217041015625, + "learning_rate": 1.1890243902439025e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 26 + }, + { + "completion_length": 911.0000305175781, + "epoch": 0.0041158536585365856, + "grad_norm": 0.0332877876762154, + "kl": 0.00022363662719726562, + "learning_rate": 1.2347560975609758e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 27 + }, + { + "completion_length": 881.8333740234375, + "epoch": 0.00426829268292683, + "grad_norm": 0.019466511418422226, + "kl": 0.00019359588623046875, + "learning_rate": 1.2804878048780488e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 28 + }, + { + "completion_length": 1876.0, + "epoch": 0.004420731707317073, + "grad_norm": 0.010528932767594253, + "kl": 0.00016069412231445312, + "learning_rate": 1.326219512195122e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 29 + }, + { + "completion_length": 1448.0000610351562, + "epoch": 0.004573170731707317, + "grad_norm": 0.01083661537826827, + "kl": 0.000164031982421875, + "learning_rate": 1.371951219512195e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 30 + }, + { + "completion_length": 1325.0000610351562, + "epoch": 0.004725609756097561, + "grad_norm": 0.02582948767035922, + "kl": 0.00022602081298828125, + "learning_rate": 1.4176829268292683e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 31 + }, + { + "completion_length": 1294.5000610351562, + "epoch": 0.004878048780487805, + "grad_norm": 0.013646266123944912, + "kl": 0.00020360946655273438, + "learning_rate": 1.4634146341463415e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 32 + }, + { + "completion_length": 876.8333435058594, + "epoch": 0.005030487804878049, + "grad_norm": 0.0197209640790186, + "kl": 0.0001327991485595703, + "learning_rate": 1.5091463414634148e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 33 + }, + { + "completion_length": 1749.6667175292969, + "epoch": 0.005182926829268293, + "grad_norm": 0.016619210921168623, + "kl": 0.0001227855682373047, + "learning_rate": 1.5548780487804878e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 34 + }, + { + "completion_length": 787.3333435058594, + "epoch": 0.005335365853658537, + "grad_norm": 0.05404770348949276, + "kl": 0.000225067138671875, + "learning_rate": 1.600609756097561e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 35 + }, + { + "completion_length": 607.3333587646484, + "epoch": 0.00548780487804878, + "grad_norm": 0.06065538511849115, + "kl": 0.0002808570861816406, + "learning_rate": 1.6463414634146343e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 36 + }, + { + "completion_length": 908.5, + "epoch": 0.005640243902439024, + "grad_norm": 0.02197417845182026, + "kl": 0.00021123886108398438, + "learning_rate": 1.6920731707317073e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 37 + }, + { + "completion_length": 1149.0000305175781, + "epoch": 0.005792682926829268, + "grad_norm": 0.04359076681308879, + "kl": 0.0002384185791015625, + "learning_rate": 1.7378048780487805e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 38 + }, + { + "completion_length": 866.3333740234375, + "epoch": 0.005945121951219512, + "grad_norm": 0.014023301191771817, + "kl": 0.00022029876708984375, + "learning_rate": 1.7835365853658535e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 39 + }, + { + "completion_length": 761.6666870117188, + "epoch": 0.006097560975609756, + "grad_norm": 0.011545847794592163, + "kl": 0.0001125335693359375, + "learning_rate": 1.8292682926829268e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 40 + }, + { + "completion_length": 890.1666870117188, + "epoch": 0.00625, + "grad_norm": 0.01393544696614078, + "kl": 0.00014710426330566406, + "learning_rate": 1.875e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 41 + }, + { + "completion_length": 936.3333435058594, + "epoch": 0.0064024390243902435, + "grad_norm": 0.013158045183788644, + "kl": 0.0002079010009765625, + "learning_rate": 1.9207317073170733e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 42 + }, + { + "completion_length": 800.5000305175781, + "epoch": 0.0065548780487804876, + "grad_norm": 2.3787073689286125, + "kl": 0.0002751350402832031, + "learning_rate": 1.9664634146341466e-07, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 43 + }, + { + "completion_length": 1188.8333740234375, + "epoch": 0.006707317073170732, + "grad_norm": 0.014183517953036897, + "kl": 0.00018978118896484375, + "learning_rate": 2.0121951219512198e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 44 + }, + { + "completion_length": 951.6666870117188, + "epoch": 0.006859756097560976, + "grad_norm": 1.8929455780755233, + "kl": 0.00019073486328125, + "learning_rate": 2.0579268292682928e-07, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 45 + }, + { + "completion_length": 1116.6667175292969, + "epoch": 0.00701219512195122, + "grad_norm": 0.01677431662108465, + "kl": 0.0002455711364746094, + "learning_rate": 2.103658536585366e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 46 + }, + { + "completion_length": 746.6666870117188, + "epoch": 0.007164634146341464, + "grad_norm": 0.012923640673517551, + "kl": 0.000110626220703125, + "learning_rate": 2.1493902439024388e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 47 + }, + { + "completion_length": 1209.6667175292969, + "epoch": 0.007317073170731708, + "grad_norm": 0.018742108973106082, + "kl": 0.00017189979553222656, + "learning_rate": 2.195121951219512e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 48 + }, + { + "completion_length": 1063.0, + "epoch": 0.007469512195121951, + "grad_norm": 0.016321484490344883, + "kl": 0.00016069412231445312, + "learning_rate": 2.2408536585365853e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 49 + }, + { + "completion_length": 2442.6666870117188, + "epoch": 0.007621951219512195, + "grad_norm": 0.012128112857678367, + "kl": 0.00014734268188476562, + "learning_rate": 2.2865853658536586e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 50 + }, + { + "completion_length": 1439.8333740234375, + "epoch": 0.007774390243902439, + "grad_norm": 0.009874953919947003, + "kl": 0.00015544891357421875, + "learning_rate": 2.3323170731707318e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 51 + }, + { + "completion_length": 889.3333587646484, + "epoch": 0.007926829268292683, + "grad_norm": 1.7295684635982773, + "kl": 0.00016927719116210938, + "learning_rate": 2.378048780487805e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 52 + }, + { + "completion_length": 1255.5000610351562, + "epoch": 0.008079268292682927, + "grad_norm": 0.015513045654217648, + "kl": 0.00016736984252929688, + "learning_rate": 2.423780487804878e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 53 + }, + { + "completion_length": 1421.5000610351562, + "epoch": 0.008231707317073171, + "grad_norm": 0.012351596545976316, + "kl": 0.0001926422119140625, + "learning_rate": 2.4695121951219516e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 54 + }, + { + "completion_length": 1230.8333740234375, + "epoch": 0.008384146341463415, + "grad_norm": 0.02876732896839141, + "kl": 0.00019598007202148438, + "learning_rate": 2.5152439024390246e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 55 + }, + { + "completion_length": 1104.8333435058594, + "epoch": 0.00853658536585366, + "grad_norm": 0.013222096694906454, + "kl": 0.000171661376953125, + "learning_rate": 2.5609756097560976e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 56 + }, + { + "completion_length": 1399.5, + "epoch": 0.008689024390243903, + "grad_norm": 0.02000904600301106, + "kl": 0.00022268295288085938, + "learning_rate": 2.6067073170731706e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 57 + }, + { + "completion_length": 876.6667175292969, + "epoch": 0.008841463414634146, + "grad_norm": 2.2637991381676397, + "kl": 0.0002205371856689453, + "learning_rate": 2.652439024390244e-07, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 58 + }, + { + "completion_length": 1772.166748046875, + "epoch": 0.00899390243902439, + "grad_norm": 0.019417505447541082, + "kl": 0.00018644332885742188, + "learning_rate": 2.698170731707317e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 59 + }, + { + "completion_length": 1514.5000610351562, + "epoch": 0.009146341463414634, + "grad_norm": 0.011222398586630577, + "kl": 0.00011181831359863281, + "learning_rate": 2.74390243902439e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 60 + }, + { + "completion_length": 884.6666870117188, + "epoch": 0.009298780487804878, + "grad_norm": 0.026213334354059, + "kl": 0.000324249267578125, + "learning_rate": 2.7896341463414636e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 61 + }, + { + "completion_length": 1139.6666870117188, + "epoch": 0.009451219512195122, + "grad_norm": 3.613347063303942, + "kl": 0.00022363662719726562, + "learning_rate": 2.8353658536585366e-07, + "loss": 0.0, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 62 + }, + { + "completion_length": 1575.1666870117188, + "epoch": 0.009603658536585366, + "grad_norm": 0.01381949057635919, + "kl": 0.0001583099365234375, + "learning_rate": 2.88109756097561e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 63 + }, + { + "completion_length": 856.0000610351562, + "epoch": 0.00975609756097561, + "grad_norm": 0.014186279253998279, + "kl": 0.00017118453979492188, + "learning_rate": 2.926829268292683e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 64 + }, + { + "completion_length": 655.6666870117188, + "epoch": 0.009908536585365854, + "grad_norm": 0.31117693809978814, + "kl": 0.0006389617919921875, + "learning_rate": 2.972560975609756e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 65 + }, + { + "completion_length": 860.0000305175781, + "epoch": 0.010060975609756098, + "grad_norm": 0.014898856307470636, + "kl": 0.0002002716064453125, + "learning_rate": 3.0182926829268296e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 66 + }, + { + "completion_length": 918.6666870117188, + "epoch": 0.010213414634146342, + "grad_norm": 0.025230105666306155, + "kl": 0.0002593994140625, + "learning_rate": 3.0640243902439026e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 67 + }, + { + "completion_length": 848.3333435058594, + "epoch": 0.010365853658536586, + "grad_norm": 2.8659763449779536, + "kl": 0.00022935867309570312, + "learning_rate": 3.1097560975609756e-07, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 68 + }, + { + "completion_length": 704.6666870117188, + "epoch": 0.01051829268292683, + "grad_norm": 0.019907013793585355, + "kl": 0.00020265579223632812, + "learning_rate": 3.1554878048780486e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 69 + }, + { + "completion_length": 1491.6666870117188, + "epoch": 0.010670731707317074, + "grad_norm": 0.014701056839594535, + "kl": 0.00020694732666015625, + "learning_rate": 3.201219512195122e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 70 + }, + { + "completion_length": 621.8333740234375, + "epoch": 0.010823170731707316, + "grad_norm": 0.029470775050479965, + "kl": 0.00039005279541015625, + "learning_rate": 3.246951219512195e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 71 + }, + { + "completion_length": 712.3333435058594, + "epoch": 0.01097560975609756, + "grad_norm": 0.050467613929477365, + "kl": 0.0003833770751953125, + "learning_rate": 3.2926829268292686e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 72 + }, + { + "completion_length": 898.8333435058594, + "epoch": 0.011128048780487804, + "grad_norm": 2.764565470849131, + "kl": 0.00034618377685546875, + "learning_rate": 3.3384146341463416e-07, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 73 + }, + { + "completion_length": 1239.6666870117188, + "epoch": 0.011280487804878048, + "grad_norm": 0.02103147203184694, + "kl": 0.00024318695068359375, + "learning_rate": 3.3841463414634146e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 74 + }, + { + "completion_length": 1744.8333740234375, + "epoch": 0.011432926829268292, + "grad_norm": 0.023677547155862386, + "kl": 0.00037479400634765625, + "learning_rate": 3.429878048780488e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 75 + }, + { + "completion_length": 754.3333740234375, + "epoch": 0.011585365853658536, + "grad_norm": 0.021436914705059706, + "kl": 0.0002994537353515625, + "learning_rate": 3.475609756097561e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 76 + }, + { + "completion_length": 1387.6666870117188, + "epoch": 0.01173780487804878, + "grad_norm": 0.03389036917258919, + "kl": 0.0003643035888671875, + "learning_rate": 3.521341463414634e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 77 + }, + { + "completion_length": 938.5000305175781, + "epoch": 0.011890243902439025, + "grad_norm": 0.02516859462678318, + "kl": 0.00022983551025390625, + "learning_rate": 3.567073170731707e-07, + "loss": 0.0, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 78 + }, + { + "completion_length": 1567.3333435058594, + "epoch": 0.012042682926829269, + "grad_norm": 0.027634795496880186, + "kl": 0.0002455711364746094, + "learning_rate": 3.6128048780487806e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 79 + }, + { + "completion_length": 1278.666748046875, + "epoch": 0.012195121951219513, + "grad_norm": 2.4483697888639586, + "kl": 0.00031566619873046875, + "learning_rate": 3.6585365853658536e-07, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 80 + }, + { + "completion_length": 1224.5, + "epoch": 0.012347560975609757, + "grad_norm": 0.016501412098231437, + "kl": 0.00024127960205078125, + "learning_rate": 3.704268292682927e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 81 + }, + { + "completion_length": 1883.3333740234375, + "epoch": 0.0125, + "grad_norm": 0.01478724988860875, + "kl": 0.00020503997802734375, + "learning_rate": 3.75e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 82 + }, + { + "completion_length": 1158.0, + "epoch": 0.012652439024390245, + "grad_norm": 0.02559516809709328, + "kl": 0.0002803802490234375, + "learning_rate": 3.795731707317073e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 83 + }, + { + "completion_length": 799.6667175292969, + "epoch": 0.012804878048780487, + "grad_norm": 0.03246874632886398, + "kl": 0.00040531158447265625, + "learning_rate": 3.8414634146341466e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 84 + }, + { + "completion_length": 1044.3333740234375, + "epoch": 0.012957317073170731, + "grad_norm": 2.139015514650515, + "kl": 0.0002803802490234375, + "learning_rate": 3.8871951219512196e-07, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 85 + }, + { + "completion_length": 1579.0000610351562, + "epoch": 0.013109756097560975, + "grad_norm": 0.022344512622216112, + "kl": 0.00030040740966796875, + "learning_rate": 3.932926829268293e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 86 + }, + { + "completion_length": 1059.6666870117188, + "epoch": 0.01326219512195122, + "grad_norm": 0.027934118644805925, + "kl": 0.00039768218994140625, + "learning_rate": 3.978658536585366e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 87 + }, + { + "completion_length": 974.3333740234375, + "epoch": 0.013414634146341463, + "grad_norm": 0.027817699877183077, + "kl": 0.000438690185546875, + "learning_rate": 4.0243902439024396e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 88 + }, + { + "completion_length": 786.6666870117188, + "epoch": 0.013567073170731707, + "grad_norm": 0.03213624089478823, + "kl": 0.00039577484130859375, + "learning_rate": 4.0701219512195126e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 89 + }, + { + "completion_length": 660.1666870117188, + "epoch": 0.013719512195121951, + "grad_norm": 3.2889059406625503, + "kl": 0.00054931640625, + "learning_rate": 4.1158536585365856e-07, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 90 + }, + { + "completion_length": 1455.3334350585938, + "epoch": 0.013871951219512195, + "grad_norm": 0.018118987010382313, + "kl": 0.00028705596923828125, + "learning_rate": 4.161585365853659e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 91 + }, + { + "completion_length": 1199.5000610351562, + "epoch": 0.01402439024390244, + "grad_norm": 0.046887154555098995, + "kl": 0.000446319580078125, + "learning_rate": 4.207317073170732e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 92 + }, + { + "completion_length": 903.1666870117188, + "epoch": 0.014176829268292683, + "grad_norm": 2.114762833581505, + "kl": 0.0004930496215820312, + "learning_rate": 4.2530487804878046e-07, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 93 + }, + { + "completion_length": 946.8333740234375, + "epoch": 0.014329268292682927, + "grad_norm": 0.030259657079530843, + "kl": 0.00051116943359375, + "learning_rate": 4.2987804878048776e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 94 + }, + { + "completion_length": 886.3333740234375, + "epoch": 0.014481707317073171, + "grad_norm": 2.110256526692472, + "kl": 0.000919342041015625, + "learning_rate": 4.344512195121951e-07, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 95 + }, + { + "completion_length": 795.0000305175781, + "epoch": 0.014634146341463415, + "grad_norm": 0.03964935012955128, + "kl": 0.000579833984375, + "learning_rate": 4.390243902439024e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 96 + }, + { + "completion_length": 664.1666870117188, + "epoch": 0.014786585365853658, + "grad_norm": 0.047404712392148766, + "kl": 0.000980377197265625, + "learning_rate": 4.4359756097560976e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 97 + }, + { + "completion_length": 1363.8333740234375, + "epoch": 0.014939024390243902, + "grad_norm": 2.993500692857666, + "kl": 0.0011119842529296875, + "learning_rate": 4.4817073170731706e-07, + "loss": 0.0, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 98 + }, + { + "completion_length": 1062.3333740234375, + "epoch": 0.015091463414634146, + "grad_norm": 0.043711866201004645, + "kl": 0.000850677490234375, + "learning_rate": 4.527439024390244e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 99 + }, + { + "completion_length": 1043.1666870117188, + "epoch": 0.01524390243902439, + "grad_norm": 2.469337392617505, + "kl": 0.001827239990234375, + "learning_rate": 4.573170731707317e-07, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 100 + }, + { + "completion_length": 796.3333740234375, + "epoch": 0.015396341463414634, + "grad_norm": 5.889516009425318, + "kl": 0.001983642578125, + "learning_rate": 4.61890243902439e-07, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 101 + }, + { + "completion_length": 893.8333740234375, + "epoch": 0.015548780487804878, + "grad_norm": 0.059817055891969916, + "kl": 0.001361846923828125, + "learning_rate": 4.6646341463414636e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 102 + }, + { + "completion_length": 1561.6666870117188, + "epoch": 0.015701219512195122, + "grad_norm": 0.047196076003947325, + "kl": 0.001708984375, + "learning_rate": 4.7103658536585366e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 103 + }, + { + "completion_length": 642.5000305175781, + "epoch": 0.015853658536585366, + "grad_norm": 0.1284798402744426, + "kl": 0.00322723388671875, + "learning_rate": 4.75609756097561e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 104 + }, + { + "completion_length": 1396.8333435058594, + "epoch": 0.01600609756097561, + "grad_norm": 0.13233440925677176, + "kl": 0.00243377685546875, + "learning_rate": 4.801829268292683e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 105 + }, + { + "completion_length": 915.6667175292969, + "epoch": 0.016158536585365854, + "grad_norm": 0.09203954241771008, + "kl": 0.0037078857421875, + "learning_rate": 4.847560975609756e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 106 + }, + { + "completion_length": 1257.8333740234375, + "epoch": 0.016310975609756098, + "grad_norm": 3.15591314997433, + "kl": 0.00183868408203125, + "learning_rate": 4.893292682926829e-07, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 107 + }, + { + "completion_length": 702.8333740234375, + "epoch": 0.016463414634146342, + "grad_norm": 0.13783491017459432, + "kl": 0.00360870361328125, + "learning_rate": 4.939024390243903e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 108 + }, + { + "completion_length": 1044.8333587646484, + "epoch": 0.016615853658536586, + "grad_norm": 0.12194022821315956, + "kl": 0.003307342529296875, + "learning_rate": 4.984756097560976e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 109 + }, + { + "completion_length": 694.3333435058594, + "epoch": 0.01676829268292683, + "grad_norm": 2.3508164993284586, + "kl": 0.003936767578125, + "learning_rate": 5.030487804878049e-07, + "loss": 0.0002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 110 + }, + { + "completion_length": 651.1666717529297, + "epoch": 0.016920731707317074, + "grad_norm": 0.1711998230192789, + "kl": 0.00457763671875, + "learning_rate": 5.076219512195122e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 111 + }, + { + "completion_length": 756.0, + "epoch": 0.01707317073170732, + "grad_norm": 0.06776078419373109, + "kl": 0.002227783203125, + "learning_rate": 5.121951219512195e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 112 + }, + { + "completion_length": 1190.166748046875, + "epoch": 0.017225609756097562, + "grad_norm": 0.04445051919287615, + "kl": 0.001827239990234375, + "learning_rate": 5.167682926829268e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 113 + }, + { + "completion_length": 2431.33349609375, + "epoch": 0.017378048780487806, + "grad_norm": 0.03541480472440673, + "kl": 0.0010223388671875, + "learning_rate": 5.213414634146341e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 114 + }, + { + "completion_length": 1827.8334350585938, + "epoch": 0.01753048780487805, + "grad_norm": 0.02748610053278798, + "kl": 0.000957489013671875, + "learning_rate": 5.259146341463414e-07, + "loss": 0.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 115 + }, + { + "completion_length": 906.3333740234375, + "epoch": 0.01768292682926829, + "grad_norm": 0.03769932600785743, + "kl": 0.00131988525390625, + "learning_rate": 5.304878048780488e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 116 + }, + { + "completion_length": 1149.0000305175781, + "epoch": 0.017835365853658535, + "grad_norm": 0.2694331651276358, + "kl": 0.00424957275390625, + "learning_rate": 5.350609756097561e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 117 + }, + { + "completion_length": 1360.1666870117188, + "epoch": 0.01798780487804878, + "grad_norm": 0.10352136063148636, + "kl": 0.00154876708984375, + "learning_rate": 5.396341463414634e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 118 + }, + { + "completion_length": 755.3333740234375, + "epoch": 0.018140243902439023, + "grad_norm": 0.30891114063313313, + "kl": 0.004398345947265625, + "learning_rate": 5.442073170731707e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 119 + }, + { + "completion_length": 1389.1667175292969, + "epoch": 0.018292682926829267, + "grad_norm": 2.1900023994866986, + "kl": 0.002197265625, + "learning_rate": 5.48780487804878e-07, + "loss": 0.0001, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.1666666716337204, + "step": 120 + }, + { + "completion_length": 920.8333435058594, + "epoch": 0.01844512195121951, + "grad_norm": 0.030889411752991713, + "kl": 0.0007038116455078125, + "learning_rate": 5.533536585365854e-07, + "loss": 0.0, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 121 + }, + { + "completion_length": 1338.666748046875, + "epoch": 0.018597560975609755, + "grad_norm": 0.0386197675887802, + "kl": 0.001644134521484375, + "learning_rate": 5.579268292682927e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 122 + }, + { + "completion_length": 1546.3333435058594, + "epoch": 0.01875, + "grad_norm": 0.04662280862491571, + "kl": 0.001422882080078125, + "learning_rate": 5.625e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 123 + }, + { + "completion_length": 944.8333740234375, + "epoch": 0.018902439024390243, + "grad_norm": 0.056063083066018204, + "kl": 0.0023345947265625, + "learning_rate": 5.670731707317073e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 124 + }, + { + "completion_length": 944.6666870117188, + "epoch": 0.019054878048780487, + "grad_norm": 0.04468780833112194, + "kl": 0.001575469970703125, + "learning_rate": 5.716463414634146e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 125 + }, + { + "completion_length": 818.3333740234375, + "epoch": 0.01920731707317073, + "grad_norm": 6.700456387751431, + "kl": 0.04503631591796875, + "learning_rate": 5.76219512195122e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 126 + }, + { + "completion_length": 1007.6666870117188, + "epoch": 0.019359756097560975, + "grad_norm": 2.3420803006803106, + "kl": 0.001819610595703125, + "learning_rate": 5.807926829268293e-07, + "loss": 0.0001, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 127 + }, + { + "completion_length": 1342.6666870117188, + "epoch": 0.01951219512195122, + "grad_norm": 0.08896010798123181, + "kl": 0.002307891845703125, + "learning_rate": 5.853658536585366e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 128 + }, + { + "completion_length": 798.8333435058594, + "epoch": 0.019664634146341464, + "grad_norm": 0.1043121745938157, + "kl": 0.00351715087890625, + "learning_rate": 5.899390243902439e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 129 + }, + { + "completion_length": 860.5000305175781, + "epoch": 0.019817073170731708, + "grad_norm": 0.0736954324635944, + "kl": 0.00272369384765625, + "learning_rate": 5.945121951219512e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 130 + }, + { + "completion_length": 967.5000305175781, + "epoch": 0.01996951219512195, + "grad_norm": 0.097839460902875, + "kl": 0.003662109375, + "learning_rate": 5.990853658536586e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 131 + }, + { + "completion_length": 1154.8333740234375, + "epoch": 0.020121951219512196, + "grad_norm": 0.08979027507724219, + "kl": 0.003631591796875, + "learning_rate": 6.036585365853659e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 132 + }, + { + "completion_length": 730.0000305175781, + "epoch": 0.02027439024390244, + "grad_norm": 0.11678467893047656, + "kl": 0.00384521484375, + "learning_rate": 6.082317073170732e-07, + "loss": 0.0002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 133 + }, + { + "completion_length": 3381.0, + "epoch": 0.020426829268292684, + "grad_norm": 0.03368517360760903, + "kl": 0.001491546630859375, + "learning_rate": 6.128048780487805e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 134 + }, + { + "completion_length": 977.0000305175781, + "epoch": 0.020579268292682928, + "grad_norm": 0.11450257403329192, + "kl": 0.00478363037109375, + "learning_rate": 6.173780487804878e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 135 + }, + { + "completion_length": 691.8333435058594, + "epoch": 0.020731707317073172, + "grad_norm": 0.06376218119561698, + "kl": 0.00278472900390625, + "learning_rate": 6.219512195121951e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 136 + }, + { + "completion_length": 764.1666870117188, + "epoch": 0.020884146341463416, + "grad_norm": 3.1490372561089015, + "kl": 0.00304412841796875, + "learning_rate": 6.265243902439024e-07, + "loss": 0.0001, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 137 + }, + { + "completion_length": 1243.666748046875, + "epoch": 0.02103658536585366, + "grad_norm": 0.04818701129306559, + "kl": 0.001522064208984375, + "learning_rate": 6.310975609756097e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 138 + }, + { + "completion_length": 865.5000305175781, + "epoch": 0.021189024390243904, + "grad_norm": 0.04783773522567232, + "kl": 0.003326416015625, + "learning_rate": 6.356707317073171e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 139 + }, + { + "completion_length": 818.1666870117188, + "epoch": 0.021341463414634148, + "grad_norm": 0.06864265773062184, + "kl": 0.005828857421875, + "learning_rate": 6.402439024390244e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 140 + }, + { + "completion_length": 986.0000610351562, + "epoch": 0.02149390243902439, + "grad_norm": 0.13536310036057006, + "kl": 0.005157470703125, + "learning_rate": 6.448170731707317e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 141 + }, + { + "completion_length": 931.1666870117188, + "epoch": 0.021646341463414633, + "grad_norm": 0.05380257810236164, + "kl": 0.0030670166015625, + "learning_rate": 6.49390243902439e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 142 + }, + { + "completion_length": 1607.0, + "epoch": 0.021798780487804877, + "grad_norm": 0.06345561243195946, + "kl": 0.0027923583984375, + "learning_rate": 6.539634146341463e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 143 + }, + { + "completion_length": 868.3333435058594, + "epoch": 0.02195121951219512, + "grad_norm": 0.0532185459016492, + "kl": 0.003082275390625, + "learning_rate": 6.585365853658537e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 144 + }, + { + "completion_length": 1028.3333740234375, + "epoch": 0.022103658536585365, + "grad_norm": 0.039818895655630036, + "kl": 0.00197601318359375, + "learning_rate": 6.63109756097561e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 145 + }, + { + "completion_length": 811.5, + "epoch": 0.02225609756097561, + "grad_norm": 0.0543920155031363, + "kl": 0.00313568115234375, + "learning_rate": 6.676829268292683e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 146 + }, + { + "completion_length": 1060.0000305175781, + "epoch": 0.022408536585365853, + "grad_norm": 0.0368250916170557, + "kl": 0.00162506103515625, + "learning_rate": 6.722560975609756e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 147 + }, + { + "completion_length": 889.0000305175781, + "epoch": 0.022560975609756097, + "grad_norm": 0.04483696885235642, + "kl": 0.0044708251953125, + "learning_rate": 6.768292682926829e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 148 + }, + { + "completion_length": 803.5000305175781, + "epoch": 0.02271341463414634, + "grad_norm": 0.0443629442496357, + "kl": 0.00218963623046875, + "learning_rate": 6.814024390243903e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 149 + }, + { + "completion_length": 1003.0000305175781, + "epoch": 0.022865853658536585, + "grad_norm": 0.03986393718508334, + "kl": 0.001667022705078125, + "learning_rate": 6.859756097560976e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 150 + }, + { + "completion_length": 929.5000305175781, + "epoch": 0.02301829268292683, + "grad_norm": 0.08127054698322589, + "kl": 0.00200653076171875, + "learning_rate": 6.905487804878049e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 151 + }, + { + "completion_length": 2202.1666870117188, + "epoch": 0.023170731707317073, + "grad_norm": 0.049130317101112896, + "kl": 0.00243377685546875, + "learning_rate": 6.951219512195122e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 152 + }, + { + "completion_length": 1323.8333740234375, + "epoch": 0.023323170731707317, + "grad_norm": 0.03555426580444572, + "kl": 0.00189971923828125, + "learning_rate": 6.996951219512195e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 153 + }, + { + "completion_length": 658.1666870117188, + "epoch": 0.02347560975609756, + "grad_norm": 2.522066284811479, + "kl": 0.0298309326171875, + "learning_rate": 7.042682926829268e-07, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 154 + }, + { + "completion_length": 690.8333435058594, + "epoch": 0.023628048780487805, + "grad_norm": 0.060889899755031486, + "kl": 0.003383636474609375, + "learning_rate": 7.088414634146341e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 155 + }, + { + "completion_length": 1301.8334197998047, + "epoch": 0.02378048780487805, + "grad_norm": 0.04159496878107686, + "kl": 0.002105712890625, + "learning_rate": 7.134146341463414e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 156 + }, + { + "completion_length": 973.3333435058594, + "epoch": 0.023932926829268293, + "grad_norm": 2.941975191106838, + "kl": 0.001392364501953125, + "learning_rate": 7.179878048780488e-07, + "loss": 0.0001, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 157 + }, + { + "completion_length": 820.5, + "epoch": 0.024085365853658537, + "grad_norm": 0.0347331447688593, + "kl": 0.0016632080078125, + "learning_rate": 7.225609756097561e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 158 + }, + { + "completion_length": 1198.166748046875, + "epoch": 0.02423780487804878, + "grad_norm": 0.032597393865735055, + "kl": 0.00159454345703125, + "learning_rate": 7.271341463414634e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 159 + }, + { + "completion_length": 787.3333435058594, + "epoch": 0.024390243902439025, + "grad_norm": 0.03638172927559483, + "kl": 0.001438140869140625, + "learning_rate": 7.317073170731707e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 160 + }, + { + "completion_length": 896.8333740234375, + "epoch": 0.02454268292682927, + "grad_norm": 2.2200707363116092, + "kl": 0.001983642578125, + "learning_rate": 7.36280487804878e-07, + "loss": 0.0001, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 161 + }, + { + "completion_length": 1181.8333740234375, + "epoch": 0.024695121951219513, + "grad_norm": 2.63892024855118, + "kl": 0.00238037109375, + "learning_rate": 7.408536585365854e-07, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 162 + }, + { + "completion_length": 977.8333740234375, + "epoch": 0.024847560975609757, + "grad_norm": 0.06675086407858402, + "kl": 0.00208282470703125, + "learning_rate": 7.454268292682927e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 163 + }, + { + "completion_length": 1607.5001220703125, + "epoch": 0.025, + "grad_norm": 0.03270604498586388, + "kl": 0.00177001953125, + "learning_rate": 7.5e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 164 + }, + { + "completion_length": 2455.5, + "epoch": 0.025152439024390245, + "grad_norm": 0.030374984206630568, + "kl": 0.001476287841796875, + "learning_rate": 7.545731707317073e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 165 + }, + { + "completion_length": 1440.6666870117188, + "epoch": 0.02530487804878049, + "grad_norm": 0.03432228649761483, + "kl": 0.001445770263671875, + "learning_rate": 7.591463414634146e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 166 + }, + { + "completion_length": 997.8333740234375, + "epoch": 0.02545731707317073, + "grad_norm": 0.07210643188732806, + "kl": 0.0030670166015625, + "learning_rate": 7.63719512195122e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 167 + }, + { + "completion_length": 926.3333740234375, + "epoch": 0.025609756097560974, + "grad_norm": 0.05213745084335929, + "kl": 0.0031890869140625, + "learning_rate": 7.682926829268293e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 168 + }, + { + "completion_length": 634.1666870117188, + "epoch": 0.025762195121951218, + "grad_norm": 0.0919705225726629, + "kl": 0.00440216064453125, + "learning_rate": 7.728658536585366e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 169 + }, + { + "completion_length": 906.6666870117188, + "epoch": 0.025914634146341462, + "grad_norm": 0.0548391453921814, + "kl": 0.00208282470703125, + "learning_rate": 7.774390243902439e-07, + "loss": 0.0001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 170 + }, + { + "completion_length": 1105.5000610351562, + "epoch": 0.026067073170731706, + "grad_norm": 0.049317021140862116, + "kl": 0.002532958984375, + "learning_rate": 7.820121951219512e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 171 + }, + { + "completion_length": 2141.3333740234375, + "epoch": 0.02621951219512195, + "grad_norm": 0.04463851707695693, + "kl": 0.00247955322265625, + "learning_rate": 7.865853658536586e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 172 + }, + { + "completion_length": 1005.6667175292969, + "epoch": 0.026371951219512194, + "grad_norm": 0.1810637973241244, + "kl": 0.00418853759765625, + "learning_rate": 7.911585365853659e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 173 + }, + { + "completion_length": 1565.3333740234375, + "epoch": 0.02652439024390244, + "grad_norm": 0.056707976861180405, + "kl": 0.002716064453125, + "learning_rate": 7.957317073170732e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 174 + }, + { + "completion_length": 1147.0000305175781, + "epoch": 0.026676829268292682, + "grad_norm": 0.05769214468969742, + "kl": 0.002643585205078125, + "learning_rate": 8.003048780487805e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 175 + }, + { + "completion_length": 953.6666870117188, + "epoch": 0.026829268292682926, + "grad_norm": 0.05923043405845328, + "kl": 0.00325775146484375, + "learning_rate": 8.048780487804879e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 176 + }, + { + "completion_length": 881.1666870117188, + "epoch": 0.02698170731707317, + "grad_norm": 0.049079886650968844, + "kl": 0.002025604248046875, + "learning_rate": 8.094512195121952e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 177 + }, + { + "completion_length": 761.8333740234375, + "epoch": 0.027134146341463414, + "grad_norm": 0.058315496109511, + "kl": 0.0025787353515625, + "learning_rate": 8.140243902439025e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 178 + }, + { + "completion_length": 2250.0, + "epoch": 0.02728658536585366, + "grad_norm": 0.061682288147981816, + "kl": 0.0016002655029296875, + "learning_rate": 8.185975609756098e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 179 + }, + { + "completion_length": 1722.0000457763672, + "epoch": 0.027439024390243903, + "grad_norm": 0.1621929655718173, + "kl": 0.003383636474609375, + "learning_rate": 8.231707317073171e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 180 + }, + { + "completion_length": 2184.8333740234375, + "epoch": 0.027591463414634147, + "grad_norm": 0.02904658178269762, + "kl": 0.001590728759765625, + "learning_rate": 8.277439024390245e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 181 + }, + { + "completion_length": 567.0000152587891, + "epoch": 0.02774390243902439, + "grad_norm": 2.920958185692318, + "kl": 0.00402069091796875, + "learning_rate": 8.323170731707318e-07, + "loss": 0.0002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 182 + }, + { + "completion_length": 1511.1666870117188, + "epoch": 0.027896341463414635, + "grad_norm": 0.030321587352051923, + "kl": 0.001659393310546875, + "learning_rate": 8.368902439024391e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 183 + }, + { + "completion_length": 948.8333435058594, + "epoch": 0.02804878048780488, + "grad_norm": 2.6512519355382014, + "kl": 0.002593994140625, + "learning_rate": 8.414634146341464e-07, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 184 + }, + { + "completion_length": 1151.3333740234375, + "epoch": 0.028201219512195123, + "grad_norm": 0.03383027067817121, + "kl": 0.002227783203125, + "learning_rate": 8.460365853658536e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 185 + }, + { + "completion_length": 891.3333740234375, + "epoch": 0.028353658536585367, + "grad_norm": 2.2452887352296864, + "kl": 0.003662109375, + "learning_rate": 8.506097560975609e-07, + "loss": 0.0001, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 186 + }, + { + "completion_length": 1816.0, + "epoch": 0.02850609756097561, + "grad_norm": 2.549970653295886, + "kl": 0.0021820068359375, + "learning_rate": 8.551829268292682e-07, + "loss": 0.0001, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 187 + }, + { + "completion_length": 1342.0000915527344, + "epoch": 0.028658536585365855, + "grad_norm": 2.64099233037992, + "kl": 0.00484466552734375, + "learning_rate": 8.597560975609755e-07, + "loss": 0.0002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 188 + }, + { + "completion_length": 2034.3334350585938, + "epoch": 0.0288109756097561, + "grad_norm": 0.06608823922097398, + "kl": 0.003387451171875, + "learning_rate": 8.643292682926829e-07, + "loss": 0.0001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 189 + }, + { + "completion_length": 802.3333740234375, + "epoch": 0.028963414634146343, + "grad_norm": 0.09237471986723945, + "kl": 0.00540924072265625, + "learning_rate": 8.689024390243902e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 190 + }, + { + "completion_length": 866.6667175292969, + "epoch": 0.029115853658536587, + "grad_norm": 0.05760553074086886, + "kl": 0.00408172607421875, + "learning_rate": 8.734756097560975e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 191 + }, + { + "completion_length": 1104.3333740234375, + "epoch": 0.02926829268292683, + "grad_norm": 3.1419266295695683, + "kl": 0.00951385498046875, + "learning_rate": 8.780487804878048e-07, + "loss": 0.0004, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 192 + }, + { + "completion_length": 1510.1666870117188, + "epoch": 0.02942073170731707, + "grad_norm": 1.340172476678439, + "kl": 0.0066680908203125, + "learning_rate": 8.826219512195121e-07, + "loss": 0.0003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 193 + }, + { + "completion_length": 1322.5, + "epoch": 0.029573170731707316, + "grad_norm": 3.5381459229510197, + "kl": 0.00958251953125, + "learning_rate": 8.871951219512195e-07, + "loss": 0.0004, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 194 + }, + { + "completion_length": 1165.5000610351562, + "epoch": 0.02972560975609756, + "grad_norm": 0.07231373992640522, + "kl": 0.004791259765625, + "learning_rate": 8.917682926829268e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 195 + }, + { + "completion_length": 1183.5, + "epoch": 0.029878048780487804, + "grad_norm": 0.09661549762800108, + "kl": 0.008392333984375, + "learning_rate": 8.963414634146341e-07, + "loss": 0.0003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 196 + }, + { + "completion_length": 887.3333435058594, + "epoch": 0.030030487804878048, + "grad_norm": 0.12943059628936626, + "kl": 0.0071258544921875, + "learning_rate": 9.009146341463414e-07, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 197 + }, + { + "completion_length": 1470.8333740234375, + "epoch": 0.030182926829268292, + "grad_norm": 0.07664757250048371, + "kl": 0.0048980712890625, + "learning_rate": 9.054878048780488e-07, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 198 + }, + { + "completion_length": 1357.1666870117188, + "epoch": 0.030335365853658536, + "grad_norm": 0.27146342977640725, + "kl": 0.0110321044921875, + "learning_rate": 9.100609756097561e-07, + "loss": 0.0004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 199 + }, + { + "completion_length": 995.0000305175781, + "epoch": 0.03048780487804878, + "grad_norm": 0.07073299706315002, + "kl": 0.0063934326171875, + "learning_rate": 9.146341463414634e-07, + "loss": 0.0003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 200 + }, + { + "completion_length": 859.0000305175781, + "epoch": 0.030640243902439024, + "grad_norm": 2.709887397572493, + "kl": 0.0080718994140625, + "learning_rate": 9.192073170731707e-07, + "loss": 0.0003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.1666666716337204, + "step": 201 + }, + { + "completion_length": 675.1666717529297, + "epoch": 0.030792682926829268, + "grad_norm": 0.18412544047829096, + "kl": 0.0103912353515625, + "learning_rate": 9.23780487804878e-07, + "loss": 0.0004, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 202 + }, + { + "completion_length": 805.0000305175781, + "epoch": 0.030945121951219512, + "grad_norm": 1.8484339741369522, + "kl": 0.00762939453125, + "learning_rate": 9.283536585365854e-07, + "loss": 0.0003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 203 + }, + { + "completion_length": 913.6666870117188, + "epoch": 0.031097560975609756, + "grad_norm": 2.715815488440512, + "kl": 0.009796142578125, + "learning_rate": 9.329268292682927e-07, + "loss": 0.0004, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 204 + }, + { + "completion_length": 890.1666870117188, + "epoch": 0.03125, + "grad_norm": 0.14343465277970047, + "kl": 0.0086212158203125, + "learning_rate": 9.375e-07, + "loss": 0.0003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 205 + }, + { + "completion_length": 811.1666870117188, + "epoch": 0.031402439024390244, + "grad_norm": 1.747671515979953, + "kl": 0.01165771484375, + "learning_rate": 9.420731707317073e-07, + "loss": 0.0005, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 206 + }, + { + "completion_length": 786.3333435058594, + "epoch": 0.03155487804878049, + "grad_norm": 1.8155594585155181, + "kl": 0.012176513671875, + "learning_rate": 9.466463414634146e-07, + "loss": 0.0005, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 207 + }, + { + "completion_length": 1218.0000610351562, + "epoch": 0.03170731707317073, + "grad_norm": 0.14433349268188103, + "kl": 0.006805419921875, + "learning_rate": 9.51219512195122e-07, + "loss": 0.0003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 208 + }, + { + "completion_length": 679.8333740234375, + "epoch": 0.031859756097560976, + "grad_norm": 0.17660381730023675, + "kl": 0.012298583984375, + "learning_rate": 9.557926829268293e-07, + "loss": 0.0005, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 209 + }, + { + "completion_length": 865.0000305175781, + "epoch": 0.03201219512195122, + "grad_norm": 0.12599640159845446, + "kl": 0.010894775390625, + "learning_rate": 9.603658536585366e-07, + "loss": 0.0004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 210 + }, + { + "completion_length": 1145.5000610351562, + "epoch": 0.032164634146341464, + "grad_norm": 0.14261221988471623, + "kl": 0.01031494140625, + "learning_rate": 9.64939024390244e-07, + "loss": 0.0004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 211 + }, + { + "completion_length": 828.5, + "epoch": 0.03231707317073171, + "grad_norm": 0.12838963794285568, + "kl": 0.0118408203125, + "learning_rate": 9.695121951219512e-07, + "loss": 0.0005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 212 + }, + { + "completion_length": 1019.1667175292969, + "epoch": 0.03246951219512195, + "grad_norm": 0.16550628800232278, + "kl": 0.012725830078125, + "learning_rate": 9.740853658536585e-07, + "loss": 0.0005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 213 + }, + { + "completion_length": 720.5000305175781, + "epoch": 0.032621951219512196, + "grad_norm": 2.043189134739684, + "kl": 0.01312255859375, + "learning_rate": 9.786585365853658e-07, + "loss": 0.0005, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 214 + }, + { + "completion_length": 716.1666870117188, + "epoch": 0.03277439024390244, + "grad_norm": 0.06864533862272519, + "kl": 0.00702667236328125, + "learning_rate": 9.832317073170733e-07, + "loss": 0.0003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 215 + }, + { + "completion_length": 1287.6666870117188, + "epoch": 0.032926829268292684, + "grad_norm": 0.08550526738242488, + "kl": 0.0111083984375, + "learning_rate": 9.878048780487806e-07, + "loss": 0.0004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 216 + }, + { + "completion_length": 1429.3333435058594, + "epoch": 0.03307926829268293, + "grad_norm": 0.061573302442353865, + "kl": 0.00811767578125, + "learning_rate": 9.92378048780488e-07, + "loss": 0.0003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 217 + }, + { + "completion_length": 1014.8333740234375, + "epoch": 0.03323170731707317, + "grad_norm": 0.09435114883860926, + "kl": 0.0111083984375, + "learning_rate": 9.969512195121952e-07, + "loss": 0.0004, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 218 + }, + { + "completion_length": 1248.5, + "epoch": 0.033384146341463417, + "grad_norm": 1.8119305694463037, + "kl": 0.009429931640625, + "learning_rate": 1.0015243902439025e-06, + "loss": 0.0004, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 219 + }, + { + "completion_length": 1934.5000610351562, + "epoch": 0.03353658536585366, + "grad_norm": 0.08742857625472933, + "kl": 0.0076446533203125, + "learning_rate": 1.0060975609756098e-06, + "loss": 0.0003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 220 + }, + { + "completion_length": 692.8333435058594, + "epoch": 0.033689024390243905, + "grad_norm": 0.1555533232723732, + "kl": 0.011260986328125, + "learning_rate": 1.0106707317073171e-06, + "loss": 0.0004, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 221 + }, + { + "completion_length": 1302.0000305175781, + "epoch": 0.03384146341463415, + "grad_norm": 0.08899392615123973, + "kl": 0.0109710693359375, + "learning_rate": 1.0152439024390244e-06, + "loss": 0.0004, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 222 + }, + { + "completion_length": 1133.1666717529297, + "epoch": 0.03399390243902439, + "grad_norm": 0.1174936843416528, + "kl": 0.013458251953125, + "learning_rate": 1.0198170731707317e-06, + "loss": 0.0005, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 223 + }, + { + "completion_length": 1184.6667175292969, + "epoch": 0.03414634146341464, + "grad_norm": 0.08781976709362659, + "kl": 0.0085601806640625, + "learning_rate": 1.024390243902439e-06, + "loss": 0.0003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 224 + }, + { + "completion_length": 1587.8334350585938, + "epoch": 0.03429878048780488, + "grad_norm": 0.05430751766019397, + "kl": 0.0082550048828125, + "learning_rate": 1.0289634146341465e-06, + "loss": 0.0003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 225 + }, + { + "completion_length": 1062.3333740234375, + "epoch": 0.034451219512195125, + "grad_norm": 0.06923883440781216, + "kl": 0.01019287109375, + "learning_rate": 1.0335365853658536e-06, + "loss": 0.0004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 226 + }, + { + "completion_length": 996.3333740234375, + "epoch": 0.03460365853658537, + "grad_norm": 0.05692868596551738, + "kl": 0.006256103515625, + "learning_rate": 1.038109756097561e-06, + "loss": 0.0002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 227 + }, + { + "completion_length": 1271.3333740234375, + "epoch": 0.03475609756097561, + "grad_norm": 1.8715769646754623, + "kl": 0.013702392578125, + "learning_rate": 1.0426829268292682e-06, + "loss": 0.0005, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 228 + }, + { + "completion_length": 1055.0000305175781, + "epoch": 0.03490853658536586, + "grad_norm": 3.305191834552786, + "kl": 0.01068115234375, + "learning_rate": 1.0472560975609755e-06, + "loss": 0.0004, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 229 + }, + { + "completion_length": 1094.5000610351562, + "epoch": 0.0350609756097561, + "grad_norm": 2.9060449457337767, + "kl": 0.013427734375, + "learning_rate": 1.0518292682926828e-06, + "loss": 0.0005, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 230 + }, + { + "completion_length": 1165.1666870117188, + "epoch": 0.03521341463414634, + "grad_norm": 2.0203866643545907, + "kl": 0.01116943359375, + "learning_rate": 1.0564024390243901e-06, + "loss": 0.0004, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 231 + }, + { + "completion_length": 1103.0, + "epoch": 0.03536585365853658, + "grad_norm": 0.09894386352494838, + "kl": 0.009063720703125, + "learning_rate": 1.0609756097560976e-06, + "loss": 0.0004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 232 + }, + { + "completion_length": 1071.6667175292969, + "epoch": 0.035518292682926826, + "grad_norm": 0.12479620367602495, + "kl": 0.011871337890625, + "learning_rate": 1.065548780487805e-06, + "loss": 0.0005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 233 + }, + { + "completion_length": 1648.0000610351562, + "epoch": 0.03567073170731707, + "grad_norm": 0.05734496569848097, + "kl": 0.01007080078125, + "learning_rate": 1.0701219512195122e-06, + "loss": 0.0004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 234 + }, + { + "completion_length": 1248.8333435058594, + "epoch": 0.035823170731707314, + "grad_norm": 0.15852428736253352, + "kl": 0.01544189453125, + "learning_rate": 1.0746951219512195e-06, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 235 + }, + { + "completion_length": 1460.8333740234375, + "epoch": 0.03597560975609756, + "grad_norm": 1.8295787864775213, + "kl": 0.015289306640625, + "learning_rate": 1.0792682926829268e-06, + "loss": 0.0006, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 236 + }, + { + "completion_length": 1382.1666870117188, + "epoch": 0.0361280487804878, + "grad_norm": 0.07909731974585353, + "kl": 0.01104736328125, + "learning_rate": 1.0838414634146341e-06, + "loss": 0.0004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 237 + }, + { + "completion_length": 1239.8333435058594, + "epoch": 0.036280487804878046, + "grad_norm": 0.07071095403035671, + "kl": 0.012847900390625, + "learning_rate": 1.0884146341463414e-06, + "loss": 0.0005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 238 + }, + { + "completion_length": 693.6666870117188, + "epoch": 0.03643292682926829, + "grad_norm": 0.6107158541130476, + "kl": 0.02581787109375, + "learning_rate": 1.0929878048780487e-06, + "loss": 0.001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 239 + }, + { + "completion_length": 990.0000305175781, + "epoch": 0.036585365853658534, + "grad_norm": 0.13752775197729838, + "kl": 0.01531982421875, + "learning_rate": 1.097560975609756e-06, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 240 + }, + { + "completion_length": 931.5000305175781, + "epoch": 0.03673780487804878, + "grad_norm": 3.7150181865021286, + "kl": 0.020660400390625, + "learning_rate": 1.1021341463414633e-06, + "loss": 0.0008, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 241 + }, + { + "completion_length": 1255.3333740234375, + "epoch": 0.03689024390243902, + "grad_norm": 0.0859198323773749, + "kl": 0.01007080078125, + "learning_rate": 1.1067073170731708e-06, + "loss": 0.0004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 242 + }, + { + "completion_length": 1894.0, + "epoch": 0.037042682926829266, + "grad_norm": 2.5523589692766455, + "kl": 0.014617919921875, + "learning_rate": 1.1112804878048781e-06, + "loss": 0.0006, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 243 + }, + { + "completion_length": 1649.5000610351562, + "epoch": 0.03719512195121951, + "grad_norm": 0.06294540971819974, + "kl": 0.011505126953125, + "learning_rate": 1.1158536585365854e-06, + "loss": 0.0005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 244 + }, + { + "completion_length": 740.8333435058594, + "epoch": 0.037347560975609755, + "grad_norm": 0.11421006408762611, + "kl": 0.01641845703125, + "learning_rate": 1.1204268292682927e-06, + "loss": 0.0007, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 245 + }, + { + "completion_length": 939.5, + "epoch": 0.0375, + "grad_norm": 0.10091673753297511, + "kl": 0.01727294921875, + "learning_rate": 1.125e-06, + "loss": 0.0007, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 246 + }, + { + "completion_length": 1282.3333740234375, + "epoch": 0.03765243902439024, + "grad_norm": 0.4431000688370316, + "kl": 0.015289306640625, + "learning_rate": 1.1295731707317073e-06, + "loss": 0.0006, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 247 + }, + { + "completion_length": 1566.666748046875, + "epoch": 0.03780487804878049, + "grad_norm": 0.07703582515336586, + "kl": 0.01611328125, + "learning_rate": 1.1341463414634146e-06, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 248 + }, + { + "completion_length": 1145.5000610351562, + "epoch": 0.03795731707317073, + "grad_norm": 0.857425621194326, + "kl": 0.03057861328125, + "learning_rate": 1.138719512195122e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 249 + }, + { + "completion_length": 979.5, + "epoch": 0.038109756097560975, + "grad_norm": 0.0819092006856299, + "kl": 0.008544921875, + "learning_rate": 1.1432926829268292e-06, + "loss": 0.0003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 250 + }, + { + "completion_length": 1186.0, + "epoch": 0.03826219512195122, + "grad_norm": 0.12892227040814075, + "kl": 0.01373291015625, + "learning_rate": 1.1478658536585367e-06, + "loss": 0.0006, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 251 + }, + { + "completion_length": 1055.6666870117188, + "epoch": 0.03841463414634146, + "grad_norm": 0.13528039409995596, + "kl": 0.013031005859375, + "learning_rate": 1.152439024390244e-06, + "loss": 0.0005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 252 + }, + { + "completion_length": 562.6666870117188, + "epoch": 0.03856707317073171, + "grad_norm": 2.5398020684782194, + "kl": 0.0308837890625, + "learning_rate": 1.1570121951219513e-06, + "loss": 0.0012, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 253 + }, + { + "completion_length": 660.6666870117188, + "epoch": 0.03871951219512195, + "grad_norm": 2.4464675078125047, + "kl": 0.03216552734375, + "learning_rate": 1.1615853658536586e-06, + "loss": 0.0013, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 254 + }, + { + "completion_length": 1326.6666870117188, + "epoch": 0.038871951219512195, + "grad_norm": 0.1623263305026893, + "kl": 0.0194091796875, + "learning_rate": 1.166158536585366e-06, + "loss": 0.0008, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 255 + }, + { + "completion_length": 1523.3333740234375, + "epoch": 0.03902439024390244, + "grad_norm": 1.1750403102825178, + "kl": 0.01788330078125, + "learning_rate": 1.1707317073170732e-06, + "loss": 0.0007, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 256 + }, + { + "completion_length": 1260.5000610351562, + "epoch": 0.03917682926829268, + "grad_norm": 0.1482779287084118, + "kl": 0.02001953125, + "learning_rate": 1.1753048780487805e-06, + "loss": 0.0008, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 257 + }, + { + "completion_length": 620.5000305175781, + "epoch": 0.03932926829268293, + "grad_norm": 0.2317589311589632, + "kl": 0.026123046875, + "learning_rate": 1.1798780487804878e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 258 + }, + { + "completion_length": 975.6666870117188, + "epoch": 0.03948170731707317, + "grad_norm": 0.09056799484054494, + "kl": 0.01763916015625, + "learning_rate": 1.1844512195121951e-06, + "loss": 0.0007, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 259 + }, + { + "completion_length": 1442.3333740234375, + "epoch": 0.039634146341463415, + "grad_norm": 0.15548837080650896, + "kl": 0.02178955078125, + "learning_rate": 1.1890243902439024e-06, + "loss": 0.0009, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 260 + }, + { + "completion_length": 1207.8333740234375, + "epoch": 0.03978658536585366, + "grad_norm": 0.0792811181038896, + "kl": 0.02130126953125, + "learning_rate": 1.19359756097561e-06, + "loss": 0.0009, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 261 + }, + { + "completion_length": 791.6666870117188, + "epoch": 0.0399390243902439, + "grad_norm": 0.19674818653587164, + "kl": 0.016021728515625, + "learning_rate": 1.1981707317073172e-06, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 262 + }, + { + "completion_length": 2292.8334350585938, + "epoch": 0.04009146341463415, + "grad_norm": 0.05597883194911081, + "kl": 0.012847900390625, + "learning_rate": 1.2027439024390245e-06, + "loss": 0.0005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 263 + }, + { + "completion_length": 1180.8333740234375, + "epoch": 0.04024390243902439, + "grad_norm": 0.11713389943003126, + "kl": 0.0196533203125, + "learning_rate": 1.2073170731707318e-06, + "loss": 0.0008, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 264 + }, + { + "completion_length": 924.3333435058594, + "epoch": 0.040396341463414635, + "grad_norm": 15.492780455113007, + "kl": 0.3587646484375, + "learning_rate": 1.2118902439024391e-06, + "loss": 0.0144, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 265 + }, + { + "completion_length": 1625.1666870117188, + "epoch": 0.04054878048780488, + "grad_norm": 0.055723861789119415, + "kl": 0.01153564453125, + "learning_rate": 1.2164634146341464e-06, + "loss": 0.0005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 266 + }, + { + "completion_length": 1329.5000610351562, + "epoch": 0.04070121951219512, + "grad_norm": 0.11361673303172945, + "kl": 0.0152587890625, + "learning_rate": 1.2210365853658535e-06, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 267 + }, + { + "completion_length": 1719.0000915527344, + "epoch": 0.04085365853658537, + "grad_norm": 0.08299406113222044, + "kl": 0.0155029296875, + "learning_rate": 1.225609756097561e-06, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 268 + }, + { + "completion_length": 1831.8333740234375, + "epoch": 0.04100609756097561, + "grad_norm": 0.0823099535933172, + "kl": 0.01385498046875, + "learning_rate": 1.2301829268292683e-06, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 269 + }, + { + "completion_length": 994.3333740234375, + "epoch": 0.041158536585365856, + "grad_norm": 0.6153708363487663, + "kl": 0.018585205078125, + "learning_rate": 1.2347560975609756e-06, + "loss": 0.0007, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 270 + }, + { + "completion_length": 1731.166748046875, + "epoch": 0.0413109756097561, + "grad_norm": 0.07412041024249774, + "kl": 0.015289306640625, + "learning_rate": 1.239329268292683e-06, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 271 + }, + { + "completion_length": 1028.1666870117188, + "epoch": 0.041463414634146344, + "grad_norm": 0.08832901611614799, + "kl": 0.018798828125, + "learning_rate": 1.2439024390243902e-06, + "loss": 0.0007, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 272 + }, + { + "completion_length": 998.6667175292969, + "epoch": 0.04161585365853659, + "grad_norm": 0.056840056037606126, + "kl": 0.011962890625, + "learning_rate": 1.2484756097560975e-06, + "loss": 0.0005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 273 + }, + { + "completion_length": 2319.5001220703125, + "epoch": 0.04176829268292683, + "grad_norm": 0.08340965211759856, + "kl": 0.01409912109375, + "learning_rate": 1.2530487804878048e-06, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 274 + }, + { + "completion_length": 650.6666870117188, + "epoch": 0.041920731707317076, + "grad_norm": 2.743686732034047, + "kl": 0.0335693359375, + "learning_rate": 1.2576219512195121e-06, + "loss": 0.0013, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 275 + }, + { + "completion_length": 1081.0000610351562, + "epoch": 0.04207317073170732, + "grad_norm": 0.07903578886156645, + "kl": 0.01800537109375, + "learning_rate": 1.2621951219512194e-06, + "loss": 0.0007, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 276 + }, + { + "completion_length": 1650.8333435058594, + "epoch": 0.042225609756097564, + "grad_norm": 2.6548585310796073, + "kl": 0.02105712890625, + "learning_rate": 1.2667682926829267e-06, + "loss": 0.0008, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 277 + }, + { + "completion_length": 557.3333435058594, + "epoch": 0.04237804878048781, + "grad_norm": 3.7285656896633537, + "kl": 0.02008056640625, + "learning_rate": 1.2713414634146342e-06, + "loss": 0.0008, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 278 + }, + { + "completion_length": 931.6666870117188, + "epoch": 0.04253048780487805, + "grad_norm": 0.09205712528490359, + "kl": 0.022705078125, + "learning_rate": 1.2759146341463415e-06, + "loss": 0.0009, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 279 + }, + { + "completion_length": 1098.0, + "epoch": 0.042682926829268296, + "grad_norm": 2.3726495287266927, + "kl": 0.027587890625, + "learning_rate": 1.2804878048780488e-06, + "loss": 0.0011, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 280 + }, + { + "completion_length": 2512.166748046875, + "epoch": 0.04283536585365854, + "grad_norm": 0.06879902224281646, + "kl": 0.01556396484375, + "learning_rate": 1.2850609756097561e-06, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 281 + }, + { + "completion_length": 684.5, + "epoch": 0.04298780487804878, + "grad_norm": 0.8359152046791265, + "kl": 0.052734375, + "learning_rate": 1.2896341463414634e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 282 + }, + { + "completion_length": 1987.5000610351562, + "epoch": 0.04314024390243902, + "grad_norm": 0.40756869407188806, + "kl": 0.047607421875, + "learning_rate": 1.2942073170731707e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 283 + }, + { + "completion_length": 955.5, + "epoch": 0.043292682926829265, + "grad_norm": 9.841077685398863, + "kl": 0.115966796875, + "learning_rate": 1.298780487804878e-06, + "loss": 0.0047, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 284 + }, + { + "completion_length": 1427.0, + "epoch": 0.04344512195121951, + "grad_norm": 0.29023768146799245, + "kl": 0.0419921875, + "learning_rate": 1.3033536585365853e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 285 + }, + { + "completion_length": 615.0000305175781, + "epoch": 0.04359756097560975, + "grad_norm": 1038.3260781776664, + "kl": 2.1923828125, + "learning_rate": 1.3079268292682926e-06, + "loss": 0.0879, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 286 + }, + { + "completion_length": 1164.6666870117188, + "epoch": 0.04375, + "grad_norm": 0.1928277069829346, + "kl": 0.02471923828125, + "learning_rate": 1.3125000000000001e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 287 + }, + { + "completion_length": 1923.8333740234375, + "epoch": 0.04390243902439024, + "grad_norm": 2.605563520800045, + "kl": 0.0391845703125, + "learning_rate": 1.3170731707317074e-06, + "loss": 0.0016, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 288 + }, + { + "completion_length": 1231.0000305175781, + "epoch": 0.044054878048780485, + "grad_norm": 3.083850266288167, + "kl": 0.0321044921875, + "learning_rate": 1.3216463414634147e-06, + "loss": 0.0013, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 289 + }, + { + "completion_length": 819.0000305175781, + "epoch": 0.04420731707317073, + "grad_norm": 0.15558960388213974, + "kl": 0.0279541015625, + "learning_rate": 1.326219512195122e-06, + "loss": 0.0011, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 290 + }, + { + "completion_length": 1025.5, + "epoch": 0.04435975609756097, + "grad_norm": 0.09911598980424215, + "kl": 0.03399658203125, + "learning_rate": 1.3307926829268293e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 291 + }, + { + "completion_length": 1945.0000915527344, + "epoch": 0.04451219512195122, + "grad_norm": 2.960576625728969, + "kl": 0.0455322265625, + "learning_rate": 1.3353658536585366e-06, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 292 + }, + { + "completion_length": 2432.3334350585938, + "epoch": 0.04466463414634146, + "grad_norm": 0.08549492926947608, + "kl": 0.0277099609375, + "learning_rate": 1.339939024390244e-06, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 293 + }, + { + "completion_length": 579.5000152587891, + "epoch": 0.044817073170731705, + "grad_norm": 0.13169879334804005, + "kl": 0.03656005859375, + "learning_rate": 1.3445121951219512e-06, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 294 + }, + { + "completion_length": 1388.0000610351562, + "epoch": 0.04496951219512195, + "grad_norm": 0.16005479606626333, + "kl": 0.03704833984375, + "learning_rate": 1.3490853658536585e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 295 + }, + { + "completion_length": 1281.3333435058594, + "epoch": 0.045121951219512194, + "grad_norm": 0.16401124021534189, + "kl": 0.0321044921875, + "learning_rate": 1.3536585365853658e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 296 + }, + { + "completion_length": 773.1666870117188, + "epoch": 0.04527439024390244, + "grad_norm": 0.1059622424229431, + "kl": 0.02935791015625, + "learning_rate": 1.3582317073170733e-06, + "loss": 0.0012, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 297 + }, + { + "completion_length": 987.8333435058594, + "epoch": 0.04542682926829268, + "grad_norm": 2.7597698911680424, + "kl": 0.0452880859375, + "learning_rate": 1.3628048780487806e-06, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 298 + }, + { + "completion_length": 3073.0001220703125, + "epoch": 0.045579268292682926, + "grad_norm": 1.8401802654394437, + "kl": 0.02532958984375, + "learning_rate": 1.367378048780488e-06, + "loss": 0.001, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 299 + }, + { + "completion_length": 1877.6666870117188, + "epoch": 0.04573170731707317, + "grad_norm": 0.1523572982823696, + "kl": 0.02825927734375, + "learning_rate": 1.3719512195121952e-06, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 300 + }, + { + "completion_length": 1811.0, + "epoch": 0.045884146341463414, + "grad_norm": 2.405085702278499, + "kl": 0.0570068359375, + "learning_rate": 1.3765243902439025e-06, + "loss": 0.0023, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 301 + }, + { + "completion_length": 1981.166748046875, + "epoch": 0.04603658536585366, + "grad_norm": 0.19339827993915717, + "kl": 0.0450439453125, + "learning_rate": 1.3810975609756098e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 302 + }, + { + "completion_length": 2012.666748046875, + "epoch": 0.0461890243902439, + "grad_norm": 2.6396843764372284, + "kl": 0.0489501953125, + "learning_rate": 1.3856707317073171e-06, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 303 + }, + { + "completion_length": 1795.8334350585938, + "epoch": 0.046341463414634146, + "grad_norm": 0.2520143283648831, + "kl": 0.0372314453125, + "learning_rate": 1.3902439024390244e-06, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 304 + }, + { + "completion_length": 2019.8333740234375, + "epoch": 0.04649390243902439, + "grad_norm": 0.09853133610269835, + "kl": 0.0255126953125, + "learning_rate": 1.3948170731707317e-06, + "loss": 0.001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 305 + }, + { + "completion_length": 2463.3333435058594, + "epoch": 0.046646341463414634, + "grad_norm": 0.13604840834462775, + "kl": 0.02593994140625, + "learning_rate": 1.399390243902439e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 306 + }, + { + "completion_length": 890.6666870117188, + "epoch": 0.04679878048780488, + "grad_norm": 0.16255496141037856, + "kl": 0.03375244140625, + "learning_rate": 1.4039634146341465e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 307 + }, + { + "completion_length": 2177.666748046875, + "epoch": 0.04695121951219512, + "grad_norm": 0.09069688098398579, + "kl": 0.02716064453125, + "learning_rate": 1.4085365853658536e-06, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 308 + }, + { + "completion_length": 2546.0001220703125, + "epoch": 0.047103658536585366, + "grad_norm": 2.0308998602717288, + "kl": 0.031494140625, + "learning_rate": 1.413109756097561e-06, + "loss": 0.0013, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 309 + }, + { + "completion_length": 1910.5001220703125, + "epoch": 0.04725609756097561, + "grad_norm": 0.529340165662666, + "kl": 0.03375244140625, + "learning_rate": 1.4176829268292682e-06, + "loss": 0.0013, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 310 + }, + { + "completion_length": 768.5, + "epoch": 0.047408536585365854, + "grad_norm": 0.1543456710803181, + "kl": 0.02886962890625, + "learning_rate": 1.4222560975609755e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 311 + }, + { + "completion_length": 767.1666870117188, + "epoch": 0.0475609756097561, + "grad_norm": 3.5362492489146824, + "kl": 0.0396728515625, + "learning_rate": 1.4268292682926828e-06, + "loss": 0.0016, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 312 + }, + { + "completion_length": 1150.1666870117188, + "epoch": 0.04771341463414634, + "grad_norm": 1.807245427087521, + "kl": 0.03106689453125, + "learning_rate": 1.4314024390243901e-06, + "loss": 0.0012, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 313 + }, + { + "completion_length": 1334.8333740234375, + "epoch": 0.047865853658536586, + "grad_norm": 0.35790736191823985, + "kl": 0.03741455078125, + "learning_rate": 1.4359756097560976e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 314 + }, + { + "completion_length": 1085.5, + "epoch": 0.04801829268292683, + "grad_norm": 2.4307495014352005, + "kl": 0.0416259765625, + "learning_rate": 1.440548780487805e-06, + "loss": 0.0017, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 315 + }, + { + "completion_length": 1013.5000305175781, + "epoch": 0.048170731707317074, + "grad_norm": 0.18538206312519107, + "kl": 0.031005859375, + "learning_rate": 1.4451219512195122e-06, + "loss": 0.0012, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 316 + }, + { + "completion_length": 590.6666870117188, + "epoch": 0.04832317073170732, + "grad_norm": 0.13207547530786107, + "kl": 0.04791259765625, + "learning_rate": 1.4496951219512195e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 317 + }, + { + "completion_length": 1290.0, + "epoch": 0.04847560975609756, + "grad_norm": 0.11671902336432918, + "kl": 0.03155517578125, + "learning_rate": 1.4542682926829268e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 318 + }, + { + "completion_length": 1965.166748046875, + "epoch": 0.048628048780487806, + "grad_norm": 0.09490085327374231, + "kl": 0.03106689453125, + "learning_rate": 1.4588414634146341e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 319 + }, + { + "completion_length": 2016.3334350585938, + "epoch": 0.04878048780487805, + "grad_norm": 1.6762145612457802, + "kl": 0.0411376953125, + "learning_rate": 1.4634146341463414e-06, + "loss": 0.0016, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 320 + }, + { + "completion_length": 1064.0000305175781, + "epoch": 0.048932926829268295, + "grad_norm": 0.08852649176903889, + "kl": 0.02880859375, + "learning_rate": 1.4679878048780487e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 321 + }, + { + "completion_length": 804.6666870117188, + "epoch": 0.04908536585365854, + "grad_norm": 0.09039628208819145, + "kl": 0.030029296875, + "learning_rate": 1.472560975609756e-06, + "loss": 0.0012, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 322 + }, + { + "completion_length": 903.6666870117188, + "epoch": 0.04923780487804878, + "grad_norm": 0.18186196444811145, + "kl": 0.0379638671875, + "learning_rate": 1.4771341463414633e-06, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 323 + }, + { + "completion_length": 1537.5000610351562, + "epoch": 0.04939024390243903, + "grad_norm": 2.1865633889858147, + "kl": 0.049560546875, + "learning_rate": 1.4817073170731708e-06, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 324 + }, + { + "completion_length": 2556.0001220703125, + "epoch": 0.04954268292682927, + "grad_norm": 1.7723141235306228, + "kl": 0.034423828125, + "learning_rate": 1.4862804878048781e-06, + "loss": 0.0014, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 325 + }, + { + "completion_length": 709.5, + "epoch": 0.049695121951219515, + "grad_norm": 0.6518186011463136, + "kl": 0.048828125, + "learning_rate": 1.4908536585365854e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 326 + }, + { + "completion_length": 1951.3333740234375, + "epoch": 0.04984756097560976, + "grad_norm": 0.08235145466304981, + "kl": 0.033203125, + "learning_rate": 1.4954268292682927e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 327 + }, + { + "completion_length": 1365.0, + "epoch": 0.05, + "grad_norm": 0.13043604652029725, + "kl": 0.02996826171875, + "learning_rate": 1.5e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 328 + }, + { + "completion_length": 1602.5000610351562, + "epoch": 0.05015243902439025, + "grad_norm": 1.9130403514976952, + "kl": 0.04736328125, + "learning_rate": 1.5045731707317071e-06, + "loss": 0.0019, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 329 + }, + { + "completion_length": 1994.3333740234375, + "epoch": 0.05030487804878049, + "grad_norm": 2.9711754145309346, + "kl": 0.0560302734375, + "learning_rate": 1.5091463414634146e-06, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 330 + }, + { + "completion_length": 1340.0000610351562, + "epoch": 0.050457317073170735, + "grad_norm": 0.06706257985313269, + "kl": 0.01800537109375, + "learning_rate": 1.513719512195122e-06, + "loss": 0.0007, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 331 + }, + { + "completion_length": 2143.666748046875, + "epoch": 0.05060975609756098, + "grad_norm": 0.5293524116189932, + "kl": 0.04498291015625, + "learning_rate": 1.5182926829268292e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 332 + }, + { + "completion_length": 1536.5000610351562, + "epoch": 0.05076219512195122, + "grad_norm": 0.12683905217028357, + "kl": 0.02886962890625, + "learning_rate": 1.5228658536585365e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 333 + }, + { + "completion_length": 1360.1666870117188, + "epoch": 0.05091463414634146, + "grad_norm": 0.2461321198008429, + "kl": 0.0391845703125, + "learning_rate": 1.527439024390244e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 334 + }, + { + "completion_length": 856.6666870117188, + "epoch": 0.051067073170731704, + "grad_norm": 0.12644728478672737, + "kl": 0.040771484375, + "learning_rate": 1.5320121951219511e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 335 + }, + { + "completion_length": 2191.3333435058594, + "epoch": 0.05121951219512195, + "grad_norm": 0.12658453721310284, + "kl": 0.03692626953125, + "learning_rate": 1.5365853658536586e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 336 + }, + { + "completion_length": 3319.666748046875, + "epoch": 0.05137195121951219, + "grad_norm": 0.09583504878949438, + "kl": 0.02789306640625, + "learning_rate": 1.5411585365853657e-06, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 337 + }, + { + "completion_length": 1774.666748046875, + "epoch": 0.051524390243902436, + "grad_norm": 0.1596290563501019, + "kl": 0.0338134765625, + "learning_rate": 1.5457317073170732e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 338 + }, + { + "completion_length": 2573.1666870117188, + "epoch": 0.05167682926829268, + "grad_norm": 0.11367946203840487, + "kl": 0.0242919921875, + "learning_rate": 1.5503048780487803e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 339 + }, + { + "completion_length": 1603.0, + "epoch": 0.051829268292682924, + "grad_norm": 0.3177288065905332, + "kl": 0.0513916015625, + "learning_rate": 1.5548780487804878e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 340 + }, + { + "completion_length": 2551.0000610351562, + "epoch": 0.05198170731707317, + "grad_norm": 0.8806219023567317, + "kl": 0.02606201171875, + "learning_rate": 1.5594512195121951e-06, + "loss": 0.001, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 341 + }, + { + "completion_length": 1857.1666870117188, + "epoch": 0.05213414634146341, + "grad_norm": 0.12307031474335449, + "kl": 0.0335693359375, + "learning_rate": 1.5640243902439024e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 342 + }, + { + "completion_length": 2312.8333740234375, + "epoch": 0.052286585365853656, + "grad_norm": 0.10662168425214086, + "kl": 0.03955078125, + "learning_rate": 1.5685975609756097e-06, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 343 + }, + { + "completion_length": 3432.666748046875, + "epoch": 0.0524390243902439, + "grad_norm": 0.05588373258684943, + "kl": 0.02362060546875, + "learning_rate": 1.5731707317073172e-06, + "loss": 0.0009, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 344 + }, + { + "completion_length": 2783.8333740234375, + "epoch": 0.052591463414634144, + "grad_norm": 0.11362843847182455, + "kl": 0.0279541015625, + "learning_rate": 1.5777439024390243e-06, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 345 + }, + { + "completion_length": 1759.8333435058594, + "epoch": 0.05274390243902439, + "grad_norm": 0.16690312849417666, + "kl": 0.034423828125, + "learning_rate": 1.5823170731707318e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 346 + }, + { + "completion_length": 1070.5000610351562, + "epoch": 0.05289634146341463, + "grad_norm": 0.10475735678031971, + "kl": 0.02850341796875, + "learning_rate": 1.586890243902439e-06, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 347 + }, + { + "completion_length": 1004.3333740234375, + "epoch": 0.05304878048780488, + "grad_norm": 0.11585587765901706, + "kl": 0.041748046875, + "learning_rate": 1.5914634146341464e-06, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 348 + }, + { + "completion_length": 1899.3333740234375, + "epoch": 0.05320121951219512, + "grad_norm": 0.10642342197004173, + "kl": 0.03912353515625, + "learning_rate": 1.5960365853658535e-06, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 349 + }, + { + "completion_length": 1750.0, + "epoch": 0.053353658536585365, + "grad_norm": 0.11846846484515712, + "kl": 0.02490234375, + "learning_rate": 1.600609756097561e-06, + "loss": 0.001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 350 + }, + { + "completion_length": 905.1666870117188, + "epoch": 0.05350609756097561, + "grad_norm": 0.1629913060979647, + "kl": 0.027587890625, + "learning_rate": 1.6051829268292683e-06, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 351 + }, + { + "completion_length": 1650.8333740234375, + "epoch": 0.05365853658536585, + "grad_norm": 1.4383897775397336, + "kl": 0.0457763671875, + "learning_rate": 1.6097560975609759e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 352 + }, + { + "completion_length": 766.5, + "epoch": 0.0538109756097561, + "grad_norm": 2.710822722849833, + "kl": 0.04559326171875, + "learning_rate": 1.614329268292683e-06, + "loss": 0.0018, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 353 + }, + { + "completion_length": 2520.5000610351562, + "epoch": 0.05396341463414634, + "grad_norm": 0.08661654173097634, + "kl": 0.02484130859375, + "learning_rate": 1.6189024390243905e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 354 + }, + { + "completion_length": 2066.1666870117188, + "epoch": 0.054115853658536585, + "grad_norm": 1.015542545784341, + "kl": 0.0279541015625, + "learning_rate": 1.6234756097560975e-06, + "loss": 0.0011, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 355 + }, + { + "completion_length": 1131.0000610351562, + "epoch": 0.05426829268292683, + "grad_norm": 0.06617005992224705, + "kl": 0.0194091796875, + "learning_rate": 1.628048780487805e-06, + "loss": 0.0008, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 356 + }, + { + "completion_length": 1308.6666870117188, + "epoch": 0.05442073170731707, + "grad_norm": 0.11235088062236805, + "kl": 0.0303955078125, + "learning_rate": 1.6326219512195121e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 357 + }, + { + "completion_length": 1800.3333740234375, + "epoch": 0.05457317073170732, + "grad_norm": 0.12096223424401203, + "kl": 0.02337646484375, + "learning_rate": 1.6371951219512196e-06, + "loss": 0.0009, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 358 + }, + { + "completion_length": 1684.3333435058594, + "epoch": 0.05472560975609756, + "grad_norm": 0.11295000698165056, + "kl": 0.030029296875, + "learning_rate": 1.6417682926829267e-06, + "loss": 0.0012, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 359 + }, + { + "completion_length": 745.1666870117188, + "epoch": 0.054878048780487805, + "grad_norm": 3.0364979535139303, + "kl": 0.03155517578125, + "learning_rate": 1.6463414634146342e-06, + "loss": 0.0013, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 360 + }, + { + "completion_length": 1375.3333435058594, + "epoch": 0.05503048780487805, + "grad_norm": 0.259054993432114, + "kl": 0.0318603515625, + "learning_rate": 1.6509146341463415e-06, + "loss": 0.0013, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 361 + }, + { + "completion_length": 1349.0000610351562, + "epoch": 0.05518292682926829, + "grad_norm": 2.4731513063295, + "kl": 0.03875732421875, + "learning_rate": 1.655487804878049e-06, + "loss": 0.0015, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 362 + }, + { + "completion_length": 1096.3333435058594, + "epoch": 0.05533536585365854, + "grad_norm": 0.1197679660658877, + "kl": 0.02923583984375, + "learning_rate": 1.6600609756097561e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 363 + }, + { + "completion_length": 703.0000152587891, + "epoch": 0.05548780487804878, + "grad_norm": 0.16262221899102872, + "kl": 0.040283203125, + "learning_rate": 1.6646341463414637e-06, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 364 + }, + { + "completion_length": 2283.1666870117188, + "epoch": 0.055640243902439025, + "grad_norm": 0.395266893136742, + "kl": 0.04205322265625, + "learning_rate": 1.6692073170731707e-06, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 365 + }, + { + "completion_length": 1222.0000610351562, + "epoch": 0.05579268292682927, + "grad_norm": 0.08448668589217363, + "kl": 0.016265869140625, + "learning_rate": 1.6737804878048783e-06, + "loss": 0.0006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 366 + }, + { + "completion_length": 2055.5, + "epoch": 0.05594512195121951, + "grad_norm": 5.929798246785172, + "kl": 0.060791015625, + "learning_rate": 1.6783536585365853e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 367 + }, + { + "completion_length": 995.6667175292969, + "epoch": 0.05609756097560976, + "grad_norm": 0.1371694267203231, + "kl": 0.03729248046875, + "learning_rate": 1.6829268292682928e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 368 + }, + { + "completion_length": 1465.0000610351562, + "epoch": 0.05625, + "grad_norm": 0.2089835461590819, + "kl": 0.03564453125, + "learning_rate": 1.6875e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 369 + }, + { + "completion_length": 875.0, + "epoch": 0.056402439024390245, + "grad_norm": 0.09466883675976374, + "kl": 0.02459716796875, + "learning_rate": 1.6920731707317072e-06, + "loss": 0.001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 370 + }, + { + "completion_length": 1075.3333435058594, + "epoch": 0.05655487804878049, + "grad_norm": 0.12121809739802748, + "kl": 0.0341796875, + "learning_rate": 1.6966463414634147e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 371 + }, + { + "completion_length": 923.1666870117188, + "epoch": 0.056707317073170734, + "grad_norm": 3.309685779866614, + "kl": 0.0321044921875, + "learning_rate": 1.7012195121951218e-06, + "loss": 0.0013, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 372 + }, + { + "completion_length": 1377.5000305175781, + "epoch": 0.05685975609756098, + "grad_norm": 0.09802111725248605, + "kl": 0.0313720703125, + "learning_rate": 1.7057926829268293e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 373 + }, + { + "completion_length": 910.6667175292969, + "epoch": 0.05701219512195122, + "grad_norm": 0.09534420014301664, + "kl": 0.0244140625, + "learning_rate": 1.7103658536585364e-06, + "loss": 0.001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 374 + }, + { + "completion_length": 1507.1666870117188, + "epoch": 0.057164634146341466, + "grad_norm": 0.08698663561355584, + "kl": 0.0291748046875, + "learning_rate": 1.714939024390244e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 375 + }, + { + "completion_length": 2955.8333740234375, + "epoch": 0.05731707317073171, + "grad_norm": 1.780447932076351, + "kl": 0.0294189453125, + "learning_rate": 1.719512195121951e-06, + "loss": 0.0012, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 376 + }, + { + "completion_length": 757.3333740234375, + "epoch": 0.057469512195121954, + "grad_norm": 0.0933832503172464, + "kl": 0.026123046875, + "learning_rate": 1.7240853658536585e-06, + "loss": 0.001, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 377 + }, + { + "completion_length": 1542.166748046875, + "epoch": 0.0576219512195122, + "grad_norm": 0.09598705867772599, + "kl": 0.0286865234375, + "learning_rate": 1.7286585365853658e-06, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 378 + }, + { + "completion_length": 1120.5000610351562, + "epoch": 0.05777439024390244, + "grad_norm": 0.09393010567120247, + "kl": 0.019287109375, + "learning_rate": 1.7332317073170734e-06, + "loss": 0.0008, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 379 + }, + { + "completion_length": 1209.0000610351562, + "epoch": 0.057926829268292686, + "grad_norm": 0.11252880431644986, + "kl": 0.02288818359375, + "learning_rate": 1.7378048780487804e-06, + "loss": 0.0009, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 380 + }, + { + "completion_length": 1746.3333740234375, + "epoch": 0.05807926829268293, + "grad_norm": 2.1876057368526243, + "kl": 0.0308837890625, + "learning_rate": 1.742378048780488e-06, + "loss": 0.0012, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 381 + }, + { + "completion_length": 933.3333740234375, + "epoch": 0.058231707317073174, + "grad_norm": 0.2624364507347309, + "kl": 0.0501708984375, + "learning_rate": 1.746951219512195e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 382 + }, + { + "completion_length": 659.1666870117188, + "epoch": 0.05838414634146342, + "grad_norm": 2.908726187375745, + "kl": 0.04443359375, + "learning_rate": 1.7515243902439025e-06, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 383 + }, + { + "completion_length": 818.0000305175781, + "epoch": 0.05853658536585366, + "grad_norm": 0.1503233345278512, + "kl": 0.0277099609375, + "learning_rate": 1.7560975609756096e-06, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 384 + }, + { + "completion_length": 823.0000305175781, + "epoch": 0.0586890243902439, + "grad_norm": 0.16072190985954107, + "kl": 0.02032470703125, + "learning_rate": 1.7606707317073171e-06, + "loss": 0.0008, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 385 + }, + { + "completion_length": 1800.5, + "epoch": 0.05884146341463414, + "grad_norm": 0.1517348897658256, + "kl": 0.04290771484375, + "learning_rate": 1.7652439024390242e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 386 + }, + { + "completion_length": 1091.8333740234375, + "epoch": 0.05899390243902439, + "grad_norm": 0.09299176132272972, + "kl": 0.02545166015625, + "learning_rate": 1.7698170731707317e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 387 + }, + { + "completion_length": 1132.8333435058594, + "epoch": 0.05914634146341463, + "grad_norm": 0.9424336242950686, + "kl": 0.060791015625, + "learning_rate": 1.774390243902439e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 388 + }, + { + "completion_length": 1505.3333435058594, + "epoch": 0.059298780487804875, + "grad_norm": 0.0865917752375575, + "kl": 0.029296875, + "learning_rate": 1.7789634146341466e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 389 + }, + { + "completion_length": 824.0000305175781, + "epoch": 0.05945121951219512, + "grad_norm": 0.16882661840093957, + "kl": 0.03271484375, + "learning_rate": 1.7835365853658536e-06, + "loss": 0.0013, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 390 + }, + { + "completion_length": 915.8333435058594, + "epoch": 0.05960365853658536, + "grad_norm": 0.16001660994941236, + "kl": 0.03924560546875, + "learning_rate": 1.7881097560975612e-06, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 391 + }, + { + "completion_length": 1317.8333740234375, + "epoch": 0.05975609756097561, + "grad_norm": 0.10880853112277074, + "kl": 0.03662109375, + "learning_rate": 1.7926829268292682e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 392 + }, + { + "completion_length": 2201.5001220703125, + "epoch": 0.05990853658536585, + "grad_norm": 0.07889061756091413, + "kl": 0.0318603515625, + "learning_rate": 1.7972560975609758e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 393 + }, + { + "completion_length": 1201.5000457763672, + "epoch": 0.060060975609756095, + "grad_norm": 3.0113334610157954, + "kl": 0.0582275390625, + "learning_rate": 1.8018292682926828e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 394 + }, + { + "completion_length": 1229.8333435058594, + "epoch": 0.06021341463414634, + "grad_norm": 0.09631400513507873, + "kl": 0.03570556640625, + "learning_rate": 1.8064024390243903e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 395 + }, + { + "completion_length": 930.8333435058594, + "epoch": 0.060365853658536583, + "grad_norm": 0.11469153316750146, + "kl": 0.0362548828125, + "learning_rate": 1.8109756097560976e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 396 + }, + { + "completion_length": 1432.6666870117188, + "epoch": 0.06051829268292683, + "grad_norm": 0.18758252887548696, + "kl": 0.045166015625, + "learning_rate": 1.815548780487805e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 397 + }, + { + "completion_length": 1009.6666870117188, + "epoch": 0.06067073170731707, + "grad_norm": 2.026275047134112, + "kl": 0.0450439453125, + "learning_rate": 1.8201219512195122e-06, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 398 + }, + { + "completion_length": 1643.5000610351562, + "epoch": 0.060823170731707316, + "grad_norm": 0.12238848768969658, + "kl": 0.03515625, + "learning_rate": 1.8246951219512198e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 399 + }, + { + "completion_length": 597.6666870117188, + "epoch": 0.06097560975609756, + "grad_norm": 0.5970394082106288, + "kl": 0.0528564453125, + "learning_rate": 1.8292682926829268e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 400 + }, + { + "completion_length": 1801.1666870117188, + "epoch": 0.061128048780487804, + "grad_norm": 0.12379395760789767, + "kl": 0.040771484375, + "learning_rate": 1.8338414634146344e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 401 + }, + { + "completion_length": 894.0000305175781, + "epoch": 0.06128048780487805, + "grad_norm": 0.110241993607584, + "kl": 0.03204345703125, + "learning_rate": 1.8384146341463414e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 402 + }, + { + "completion_length": 1661.3333740234375, + "epoch": 0.06143292682926829, + "grad_norm": 0.09452163008440835, + "kl": 0.0400390625, + "learning_rate": 1.842987804878049e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 403 + }, + { + "completion_length": 660.8333435058594, + "epoch": 0.061585365853658536, + "grad_norm": 0.1242067677459992, + "kl": 0.025146484375, + "learning_rate": 1.847560975609756e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 404 + }, + { + "completion_length": 962.6667175292969, + "epoch": 0.06173780487804878, + "grad_norm": 2.4173749779690366, + "kl": 0.046875, + "learning_rate": 1.8521341463414636e-06, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 405 + }, + { + "completion_length": 1102.0000610351562, + "epoch": 0.061890243902439024, + "grad_norm": 0.07900785600566818, + "kl": 0.03131103515625, + "learning_rate": 1.8567073170731709e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 406 + }, + { + "completion_length": 828.3333435058594, + "epoch": 0.06204268292682927, + "grad_norm": 0.14169584297923318, + "kl": 0.03515625, + "learning_rate": 1.8612804878048782e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 407 + }, + { + "completion_length": 2351.5000610351562, + "epoch": 0.06219512195121951, + "grad_norm": 0.13949131510881424, + "kl": 0.0516357421875, + "learning_rate": 1.8658536585365854e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 408 + }, + { + "completion_length": 1282.8333740234375, + "epoch": 0.062347560975609756, + "grad_norm": 1.7234871151609719, + "kl": 0.03985595703125, + "learning_rate": 1.870426829268293e-06, + "loss": 0.0016, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 409 + }, + { + "completion_length": 1188.0000305175781, + "epoch": 0.0625, + "grad_norm": 0.0917662464778782, + "kl": 0.03363037109375, + "learning_rate": 1.875e-06, + "loss": 0.0013, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 410 + }, + { + "completion_length": 1204.0000305175781, + "epoch": 0.06265243902439024, + "grad_norm": 0.23537240369303108, + "kl": 0.0484619140625, + "learning_rate": 1.8795731707317071e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 411 + }, + { + "completion_length": 1010.3333740234375, + "epoch": 0.06280487804878049, + "grad_norm": 0.16246773823360383, + "kl": 0.0443115234375, + "learning_rate": 1.8841463414634146e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 412 + }, + { + "completion_length": 1226.666748046875, + "epoch": 0.06295731707317073, + "grad_norm": 1.9296695385616638, + "kl": 0.03765869140625, + "learning_rate": 1.888719512195122e-06, + "loss": 0.0015, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 413 + }, + { + "completion_length": 819.8333740234375, + "epoch": 0.06310975609756098, + "grad_norm": 2.1444920261280367, + "kl": 0.0428466796875, + "learning_rate": 1.8932926829268292e-06, + "loss": 0.0017, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 414 + }, + { + "completion_length": 1308.6666870117188, + "epoch": 0.06326219512195122, + "grad_norm": 0.33210496273974505, + "kl": 0.0557861328125, + "learning_rate": 1.8978658536585365e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 415 + }, + { + "completion_length": 873.6666870117188, + "epoch": 0.06341463414634146, + "grad_norm": 0.08210316450568635, + "kl": 0.033447265625, + "learning_rate": 1.902439024390244e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 416 + }, + { + "completion_length": 915.1667175292969, + "epoch": 0.06356707317073171, + "grad_norm": 3.303155260495599, + "kl": 0.03533935546875, + "learning_rate": 1.9070121951219511e-06, + "loss": 0.0014, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 417 + }, + { + "completion_length": 893.8333740234375, + "epoch": 0.06371951219512195, + "grad_norm": 0.17854463639787457, + "kl": 0.02825927734375, + "learning_rate": 1.9115853658536587e-06, + "loss": 0.0011, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 418 + }, + { + "completion_length": 613.0000305175781, + "epoch": 0.0638719512195122, + "grad_norm": 0.17370249913654695, + "kl": 0.04925537109375, + "learning_rate": 1.9161585365853657e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 419 + }, + { + "completion_length": 651.3333435058594, + "epoch": 0.06402439024390244, + "grad_norm": 0.12733041069395715, + "kl": 0.0433349609375, + "learning_rate": 1.9207317073170733e-06, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 420 + }, + { + "completion_length": 526.3333435058594, + "epoch": 0.06417682926829268, + "grad_norm": 0.8611430876718892, + "kl": 0.093505859375, + "learning_rate": 1.9253048780487803e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 421 + }, + { + "completion_length": 682.5, + "epoch": 0.06432926829268293, + "grad_norm": 0.11926823124665795, + "kl": 0.0447998046875, + "learning_rate": 1.929878048780488e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 422 + }, + { + "completion_length": 989.3333740234375, + "epoch": 0.06448170731707317, + "grad_norm": 0.18858582365556342, + "kl": 0.03167724609375, + "learning_rate": 1.934451219512195e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 423 + }, + { + "completion_length": 654.0000152587891, + "epoch": 0.06463414634146342, + "grad_norm": 0.1455352883078742, + "kl": 0.044677734375, + "learning_rate": 1.9390243902439024e-06, + "loss": 0.0018, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 424 + }, + { + "completion_length": 856.8333740234375, + "epoch": 0.06478658536585366, + "grad_norm": 2.1506825643159457, + "kl": 0.0601806640625, + "learning_rate": 1.9435975609756095e-06, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 425 + }, + { + "completion_length": 958.1666717529297, + "epoch": 0.0649390243902439, + "grad_norm": 3.6702483020684653, + "kl": 0.05810546875, + "learning_rate": 1.948170731707317e-06, + "loss": 0.0023, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 426 + }, + { + "completion_length": 1074.0000305175781, + "epoch": 0.06509146341463415, + "grad_norm": 0.10541325558160027, + "kl": 0.042236328125, + "learning_rate": 1.952743902439024e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 427 + }, + { + "completion_length": 639.0000152587891, + "epoch": 0.06524390243902439, + "grad_norm": 2.984910998440791, + "kl": 0.0538330078125, + "learning_rate": 1.9573170731707316e-06, + "loss": 0.0022, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 428 + }, + { + "completion_length": 822.1666870117188, + "epoch": 0.06539634146341464, + "grad_norm": 2.1209347762549027, + "kl": 0.0584716796875, + "learning_rate": 1.961890243902439e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 429 + }, + { + "completion_length": 696.6666870117188, + "epoch": 0.06554878048780488, + "grad_norm": 0.18785997424209896, + "kl": 0.051513671875, + "learning_rate": 1.9664634146341467e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 430 + }, + { + "completion_length": 860.5, + "epoch": 0.06570121951219512, + "grad_norm": 3.424538463010186, + "kl": 0.0595703125, + "learning_rate": 1.9710365853658538e-06, + "loss": 0.0024, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 431 + }, + { + "completion_length": 729.5000305175781, + "epoch": 0.06585365853658537, + "grad_norm": 0.16524885101944298, + "kl": 0.069091796875, + "learning_rate": 1.9756097560975613e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 432 + }, + { + "completion_length": 1115.1666870117188, + "epoch": 0.06600609756097561, + "grad_norm": 0.13660841828834167, + "kl": 0.0460205078125, + "learning_rate": 1.9801829268292684e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 433 + }, + { + "completion_length": 978.1666870117188, + "epoch": 0.06615853658536586, + "grad_norm": 0.15567199337081067, + "kl": 0.055419921875, + "learning_rate": 1.984756097560976e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 434 + }, + { + "completion_length": 1279.5000610351562, + "epoch": 0.0663109756097561, + "grad_norm": 0.07807433179977072, + "kl": 0.03204345703125, + "learning_rate": 1.989329268292683e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 435 + }, + { + "completion_length": 637.5, + "epoch": 0.06646341463414634, + "grad_norm": 0.1596977878498081, + "kl": 0.0633544921875, + "learning_rate": 1.9939024390243905e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 436 + }, + { + "completion_length": 827.0, + "epoch": 0.06661585365853659, + "grad_norm": 0.15683032760546614, + "kl": 0.0577392578125, + "learning_rate": 1.9984756097560975e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 437 + }, + { + "completion_length": 1034.3333435058594, + "epoch": 0.06676829268292683, + "grad_norm": 0.12436321266422015, + "kl": 0.03662109375, + "learning_rate": 2.003048780487805e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 438 + }, + { + "completion_length": 1344.3333740234375, + "epoch": 0.06692073170731708, + "grad_norm": 1.9221422881336732, + "kl": 0.04022216796875, + "learning_rate": 2.007621951219512e-06, + "loss": 0.0016, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 439 + }, + { + "completion_length": 1017.1666870117188, + "epoch": 0.06707317073170732, + "grad_norm": 2.1294803122862604, + "kl": 0.0557861328125, + "learning_rate": 2.0121951219512197e-06, + "loss": 0.0022, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 440 + }, + { + "completion_length": 1431.8333740234375, + "epoch": 0.06722560975609757, + "grad_norm": 1.512664709932275, + "kl": 0.074462890625, + "learning_rate": 2.0167682926829267e-06, + "loss": 0.003, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 441 + }, + { + "completion_length": 925.5000305175781, + "epoch": 0.06737804878048781, + "grad_norm": 0.18271249845280813, + "kl": 0.050537109375, + "learning_rate": 2.0213414634146343e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 442 + }, + { + "completion_length": 1145.0000610351562, + "epoch": 0.06753048780487805, + "grad_norm": 1.9539157512887104, + "kl": 0.0709228515625, + "learning_rate": 2.0259146341463413e-06, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 443 + }, + { + "completion_length": 1100.8333740234375, + "epoch": 0.0676829268292683, + "grad_norm": 0.197098654297166, + "kl": 0.055419921875, + "learning_rate": 2.030487804878049e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 444 + }, + { + "completion_length": 977.6666870117188, + "epoch": 0.06783536585365854, + "grad_norm": 1.7878867980620121, + "kl": 0.0950927734375, + "learning_rate": 2.035060975609756e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 445 + }, + { + "completion_length": 1023.8333435058594, + "epoch": 0.06798780487804879, + "grad_norm": 0.12639833470759998, + "kl": 0.0570068359375, + "learning_rate": 2.0396341463414635e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 446 + }, + { + "completion_length": 1117.6666870117188, + "epoch": 0.06814024390243903, + "grad_norm": 0.18978681208972265, + "kl": 0.0751953125, + "learning_rate": 2.044207317073171e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 447 + }, + { + "completion_length": 702.5000305175781, + "epoch": 0.06829268292682927, + "grad_norm": 2.99618758438965, + "kl": 0.0887451171875, + "learning_rate": 2.048780487804878e-06, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 448 + }, + { + "completion_length": 1096.0000610351562, + "epoch": 0.06844512195121952, + "grad_norm": 0.1977123890366534, + "kl": 0.08203125, + "learning_rate": 2.0533536585365856e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 449 + }, + { + "completion_length": 1158.8333740234375, + "epoch": 0.06859756097560976, + "grad_norm": 2.450624079426144, + "kl": 0.067626953125, + "learning_rate": 2.057926829268293e-06, + "loss": 0.0027, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 450 + }, + { + "completion_length": 2308.5000610351562, + "epoch": 0.06875, + "grad_norm": 0.135482284656565, + "kl": 0.070556640625, + "learning_rate": 2.0625e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 451 + }, + { + "completion_length": 1642.3333435058594, + "epoch": 0.06890243902439025, + "grad_norm": 0.19147778724962733, + "kl": 0.060302734375, + "learning_rate": 2.0670731707317072e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 452 + }, + { + "completion_length": 1156.3333740234375, + "epoch": 0.0690548780487805, + "grad_norm": 0.2106284097202651, + "kl": 0.0535888671875, + "learning_rate": 2.0716463414634148e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 453 + }, + { + "completion_length": 846.8333740234375, + "epoch": 0.06920731707317074, + "grad_norm": 0.14459069570457622, + "kl": 0.048583984375, + "learning_rate": 2.076219512195122e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 454 + }, + { + "completion_length": 1926.8333740234375, + "epoch": 0.06935975609756098, + "grad_norm": 0.08866767674701397, + "kl": 0.043212890625, + "learning_rate": 2.0807926829268294e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 455 + }, + { + "completion_length": 2215.8333740234375, + "epoch": 0.06951219512195123, + "grad_norm": 0.1024875555702485, + "kl": 0.048583984375, + "learning_rate": 2.0853658536585364e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 456 + }, + { + "completion_length": 1041.1666870117188, + "epoch": 0.06966463414634147, + "grad_norm": 0.19050624768480884, + "kl": 0.0574951171875, + "learning_rate": 2.089939024390244e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 457 + }, + { + "completion_length": 2510.6666870117188, + "epoch": 0.06981707317073171, + "grad_norm": 0.18038675912381863, + "kl": 0.052978515625, + "learning_rate": 2.094512195121951e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 458 + }, + { + "completion_length": 865.6666870117188, + "epoch": 0.06996951219512196, + "grad_norm": 0.12081159902459203, + "kl": 0.0487060546875, + "learning_rate": 2.0990853658536586e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 459 + }, + { + "completion_length": 862.3333435058594, + "epoch": 0.0701219512195122, + "grad_norm": 0.14923109827823103, + "kl": 0.0455322265625, + "learning_rate": 2.1036585365853656e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 460 + }, + { + "completion_length": 2182.5001220703125, + "epoch": 0.07027439024390245, + "grad_norm": 0.08796935257811542, + "kl": 0.0439453125, + "learning_rate": 2.108231707317073e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 461 + }, + { + "completion_length": 2733.5001220703125, + "epoch": 0.07042682926829268, + "grad_norm": 0.08168662756433238, + "kl": 0.0516357421875, + "learning_rate": 2.1128048780487802e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 462 + }, + { + "completion_length": 2326.5001220703125, + "epoch": 0.07057926829268292, + "grad_norm": 2.201430701090042, + "kl": 0.0517578125, + "learning_rate": 2.1173780487804877e-06, + "loss": 0.0021, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 463 + }, + { + "completion_length": 1102.1666717529297, + "epoch": 0.07073170731707316, + "grad_norm": 2.1038548252903664, + "kl": 0.066162109375, + "learning_rate": 2.1219512195121953e-06, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 464 + }, + { + "completion_length": 1713.666748046875, + "epoch": 0.07088414634146341, + "grad_norm": 2.172394221914499, + "kl": 0.048583984375, + "learning_rate": 2.1265243902439023e-06, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 465 + }, + { + "completion_length": 3308.8333740234375, + "epoch": 0.07103658536585365, + "grad_norm": 0.06146139191214292, + "kl": 0.036376953125, + "learning_rate": 2.13109756097561e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 466 + }, + { + "completion_length": 1681.0000610351562, + "epoch": 0.0711890243902439, + "grad_norm": 1.1135609810346263, + "kl": 0.0511474609375, + "learning_rate": 2.1356707317073174e-06, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 467 + }, + { + "completion_length": 1818.1666870117188, + "epoch": 0.07134146341463414, + "grad_norm": 0.16672051447092315, + "kl": 0.062744140625, + "learning_rate": 2.1402439024390245e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 468 + }, + { + "completion_length": 1655.0, + "epoch": 0.07149390243902438, + "grad_norm": 0.38317849623189654, + "kl": 0.079345703125, + "learning_rate": 2.144817073170732e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 469 + }, + { + "completion_length": 2033.5000610351562, + "epoch": 0.07164634146341463, + "grad_norm": 1.0278885566163398, + "kl": 0.05224609375, + "learning_rate": 2.149390243902439e-06, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 470 + }, + { + "completion_length": 1281.3333740234375, + "epoch": 0.07179878048780487, + "grad_norm": 0.15159997480952261, + "kl": 0.05126953125, + "learning_rate": 2.1539634146341466e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 471 + }, + { + "completion_length": 530.0, + "epoch": 0.07195121951219512, + "grad_norm": 2.838695144332423, + "kl": 0.0689697265625, + "learning_rate": 2.1585365853658537e-06, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 472 + }, + { + "completion_length": 1290.8333740234375, + "epoch": 0.07210365853658536, + "grad_norm": 0.1521321556435626, + "kl": 0.092041015625, + "learning_rate": 2.163109756097561e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 473 + }, + { + "completion_length": 1049.0000305175781, + "epoch": 0.0722560975609756, + "grad_norm": 0.10692409215786541, + "kl": 0.0413818359375, + "learning_rate": 2.1676829268292682e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 474 + }, + { + "completion_length": 2895.83349609375, + "epoch": 0.07240853658536585, + "grad_norm": 0.10695180688675468, + "kl": 0.0557861328125, + "learning_rate": 2.1722560975609758e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 475 + }, + { + "completion_length": 1055.6666870117188, + "epoch": 0.07256097560975609, + "grad_norm": 0.09706643595953211, + "kl": 0.03076171875, + "learning_rate": 2.176829268292683e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 476 + }, + { + "completion_length": 1677.8333740234375, + "epoch": 0.07271341463414634, + "grad_norm": 1.488610313761768, + "kl": 0.0650634765625, + "learning_rate": 2.1814024390243904e-06, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 477 + }, + { + "completion_length": 1356.5000305175781, + "epoch": 0.07286585365853658, + "grad_norm": 0.11639625155454351, + "kl": 0.051513671875, + "learning_rate": 2.1859756097560974e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 478 + }, + { + "completion_length": 1273.1666870117188, + "epoch": 0.07301829268292682, + "grad_norm": 2.382288761960426, + "kl": 0.079345703125, + "learning_rate": 2.190548780487805e-06, + "loss": 0.0032, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 479 + }, + { + "completion_length": 1838.0000610351562, + "epoch": 0.07317073170731707, + "grad_norm": 0.14424242455019304, + "kl": 0.0775146484375, + "learning_rate": 2.195121951219512e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 480 + }, + { + "completion_length": 2041.3333740234375, + "epoch": 0.07332317073170731, + "grad_norm": 0.10574225429935986, + "kl": 0.049072265625, + "learning_rate": 2.1996951219512196e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 481 + }, + { + "completion_length": 1515.8333740234375, + "epoch": 0.07347560975609756, + "grad_norm": 1.4304935985126426, + "kl": 0.0496826171875, + "learning_rate": 2.2042682926829266e-06, + "loss": 0.002, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 482 + }, + { + "completion_length": 2021.3333740234375, + "epoch": 0.0736280487804878, + "grad_norm": 0.09806977188325264, + "kl": 0.070068359375, + "learning_rate": 2.208841463414634e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 483 + }, + { + "completion_length": 1002.0000305175781, + "epoch": 0.07378048780487804, + "grad_norm": 0.12749901174993955, + "kl": 0.06689453125, + "learning_rate": 2.2134146341463417e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 484 + }, + { + "completion_length": 2747.166748046875, + "epoch": 0.07393292682926829, + "grad_norm": 0.21818236725370294, + "kl": 0.06494140625, + "learning_rate": 2.217987804878049e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 485 + }, + { + "completion_length": 1849.0000610351562, + "epoch": 0.07408536585365853, + "grad_norm": 1.1915258293669206, + "kl": 0.061279296875, + "learning_rate": 2.2225609756097563e-06, + "loss": 0.0024, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 486 + }, + { + "completion_length": 1082.5000610351562, + "epoch": 0.07423780487804878, + "grad_norm": 0.14701822588479485, + "kl": 0.0439453125, + "learning_rate": 2.2271341463414638e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 487 + }, + { + "completion_length": 2887.8333740234375, + "epoch": 0.07439024390243902, + "grad_norm": 0.12026909422526706, + "kl": 0.06640625, + "learning_rate": 2.231707317073171e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 488 + }, + { + "completion_length": 1490.0, + "epoch": 0.07454268292682927, + "grad_norm": 0.13202104980647014, + "kl": 0.0615234375, + "learning_rate": 2.2362804878048784e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 489 + }, + { + "completion_length": 2024.166748046875, + "epoch": 0.07469512195121951, + "grad_norm": 0.09861703769319381, + "kl": 0.072021484375, + "learning_rate": 2.2408536585365855e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 490 + }, + { + "completion_length": 1535.8333740234375, + "epoch": 0.07484756097560975, + "grad_norm": 2.0173412647378153, + "kl": 0.08349609375, + "learning_rate": 2.245426829268293e-06, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 491 + }, + { + "completion_length": 1527.666748046875, + "epoch": 0.075, + "grad_norm": 1.7334590471841578, + "kl": 0.073974609375, + "learning_rate": 2.25e-06, + "loss": 0.003, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 492 + }, + { + "completion_length": 2560.8334350585938, + "epoch": 0.07515243902439024, + "grad_norm": 0.8033252141662366, + "kl": 0.0518798828125, + "learning_rate": 2.254573170731707e-06, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 493 + }, + { + "completion_length": 2030.666748046875, + "epoch": 0.07530487804878049, + "grad_norm": 2.490855997344205, + "kl": 0.08154296875, + "learning_rate": 2.2591463414634147e-06, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 494 + }, + { + "completion_length": 3134.5, + "epoch": 0.07545731707317073, + "grad_norm": 0.10104326610560348, + "kl": 0.0654296875, + "learning_rate": 2.2637195121951217e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 495 + }, + { + "completion_length": 1479.6666870117188, + "epoch": 0.07560975609756097, + "grad_norm": 0.11675594206142026, + "kl": 0.0401611328125, + "learning_rate": 2.2682926829268293e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 496 + }, + { + "completion_length": 1932.8333435058594, + "epoch": 0.07576219512195122, + "grad_norm": 0.09260865192700916, + "kl": 0.0550537109375, + "learning_rate": 2.2728658536585363e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 497 + }, + { + "completion_length": 2960.3333740234375, + "epoch": 0.07591463414634146, + "grad_norm": 0.09926284350834852, + "kl": 0.0472412109375, + "learning_rate": 2.277439024390244e-06, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 498 + }, + { + "completion_length": 2877.5, + "epoch": 0.0760670731707317, + "grad_norm": 0.1064194265650492, + "kl": 0.0460205078125, + "learning_rate": 2.282012195121951e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 499 + }, + { + "completion_length": 2457.8333435058594, + "epoch": 0.07621951219512195, + "grad_norm": 0.10472663374481526, + "kl": 0.04248046875, + "learning_rate": 2.2865853658536584e-06, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 500 + }, + { + "completion_length": 1154.8333740234375, + "epoch": 0.0763719512195122, + "grad_norm": 2.4026313181946675, + "kl": 0.0771484375, + "learning_rate": 2.291158536585366e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 501 + }, + { + "completion_length": 4096.0, + "epoch": 0.07652439024390244, + "grad_norm": 0.06324492817517988, + "kl": 0.0509033203125, + "learning_rate": 2.2957317073170735e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 502 + }, + { + "completion_length": 2956.8333740234375, + "epoch": 0.07667682926829268, + "grad_norm": 0.08390146088883162, + "kl": 0.0517578125, + "learning_rate": 2.3003048780487806e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 503 + }, + { + "completion_length": 3888.3333740234375, + "epoch": 0.07682926829268293, + "grad_norm": 0.05136451134834925, + "kl": 0.046630859375, + "learning_rate": 2.304878048780488e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 504 + }, + { + "completion_length": 1368.1666870117188, + "epoch": 0.07698170731707317, + "grad_norm": 0.0957506251578167, + "kl": 0.0438232421875, + "learning_rate": 2.309451219512195e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 505 + }, + { + "completion_length": 3742.33349609375, + "epoch": 0.07713414634146341, + "grad_norm": 0.06695141778505624, + "kl": 0.0592041015625, + "learning_rate": 2.3140243902439027e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 506 + }, + { + "completion_length": 2283.5, + "epoch": 0.07728658536585366, + "grad_norm": 0.1696931172147838, + "kl": 0.0635986328125, + "learning_rate": 2.3185975609756098e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 507 + }, + { + "completion_length": 2656.3333740234375, + "epoch": 0.0774390243902439, + "grad_norm": 0.0929607998561816, + "kl": 0.0455322265625, + "learning_rate": 2.3231707317073173e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 508 + }, + { + "completion_length": 2847.1666870117188, + "epoch": 0.07759146341463415, + "grad_norm": 0.06411193099066734, + "kl": 0.0360107421875, + "learning_rate": 2.3277439024390244e-06, + "loss": 0.0014, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 509 + }, + { + "completion_length": 3735.166748046875, + "epoch": 0.07774390243902439, + "grad_norm": 0.11208604035781976, + "kl": 0.0596923828125, + "learning_rate": 2.332317073170732e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 510 + }, + { + "completion_length": 2672.3333740234375, + "epoch": 0.07789634146341463, + "grad_norm": 0.1088452906181289, + "kl": 0.0443115234375, + "learning_rate": 2.336890243902439e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 511 + }, + { + "completion_length": 3423.666748046875, + "epoch": 0.07804878048780488, + "grad_norm": 0.06479698444234247, + "kl": 0.055419921875, + "learning_rate": 2.3414634146341465e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 512 + }, + { + "completion_length": 2900.5001220703125, + "epoch": 0.07820121951219512, + "grad_norm": 0.09967857645015946, + "kl": 0.072265625, + "learning_rate": 2.3460365853658536e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 513 + }, + { + "completion_length": 2856.5, + "epoch": 0.07835365853658537, + "grad_norm": 0.0969916532036844, + "kl": 0.04541015625, + "learning_rate": 2.350609756097561e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 514 + }, + { + "completion_length": 2345.3333740234375, + "epoch": 0.07850609756097561, + "grad_norm": 0.19083929386239637, + "kl": 0.065673828125, + "learning_rate": 2.355182926829268e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 515 + }, + { + "completion_length": 2859.3333740234375, + "epoch": 0.07865853658536585, + "grad_norm": 0.09488935657658776, + "kl": 0.0589599609375, + "learning_rate": 2.3597560975609757e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 516 + }, + { + "completion_length": 1077.6666870117188, + "epoch": 0.0788109756097561, + "grad_norm": 0.3434553556455728, + "kl": 0.040771484375, + "learning_rate": 2.3643292682926827e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 517 + }, + { + "completion_length": 944.6666870117188, + "epoch": 0.07896341463414634, + "grad_norm": 2.947615722412888, + "kl": 0.0567626953125, + "learning_rate": 2.3689024390243903e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 518 + }, + { + "completion_length": 2574.6666870117188, + "epoch": 0.07911585365853659, + "grad_norm": 0.07856453622987541, + "kl": 0.04833984375, + "learning_rate": 2.3734756097560978e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 519 + }, + { + "completion_length": 3653.166748046875, + "epoch": 0.07926829268292683, + "grad_norm": 0.05075246047036239, + "kl": 0.0496826171875, + "learning_rate": 2.378048780487805e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 520 + }, + { + "completion_length": 1989.8334350585938, + "epoch": 0.07942073170731707, + "grad_norm": 1.4899777547178306, + "kl": 0.0643310546875, + "learning_rate": 2.3826219512195124e-06, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 521 + }, + { + "completion_length": 1975.666748046875, + "epoch": 0.07957317073170732, + "grad_norm": 2.1361109322583283, + "kl": 0.0682373046875, + "learning_rate": 2.38719512195122e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 522 + }, + { + "completion_length": 1512.6666870117188, + "epoch": 0.07972560975609756, + "grad_norm": 1.0498208124824244, + "kl": 0.06396484375, + "learning_rate": 2.391768292682927e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 523 + }, + { + "completion_length": 2129.1666870117188, + "epoch": 0.0798780487804878, + "grad_norm": 0.12047353047642363, + "kl": 0.0533447265625, + "learning_rate": 2.3963414634146345e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 524 + }, + { + "completion_length": 3090.5, + "epoch": 0.08003048780487805, + "grad_norm": 0.5762523834286665, + "kl": 0.0599365234375, + "learning_rate": 2.4009146341463416e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 525 + }, + { + "completion_length": 2568.3333740234375, + "epoch": 0.0801829268292683, + "grad_norm": 0.11703662962979616, + "kl": 0.0489501953125, + "learning_rate": 2.405487804878049e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 526 + }, + { + "completion_length": 3359.166748046875, + "epoch": 0.08033536585365854, + "grad_norm": 0.05884458284631813, + "kl": 0.03857421875, + "learning_rate": 2.410060975609756e-06, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 527 + }, + { + "completion_length": 3362.0001220703125, + "epoch": 0.08048780487804878, + "grad_norm": 0.05591476179838259, + "kl": 0.040771484375, + "learning_rate": 2.4146341463414637e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 528 + }, + { + "completion_length": 3104.5, + "epoch": 0.08064024390243903, + "grad_norm": 0.3519263806573645, + "kl": 0.0703125, + "learning_rate": 2.4192073170731708e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 529 + }, + { + "completion_length": 3306.3333740234375, + "epoch": 0.08079268292682927, + "grad_norm": 0.06281461484097053, + "kl": 0.0416259765625, + "learning_rate": 2.4237804878048783e-06, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 530 + }, + { + "completion_length": 1439.8333740234375, + "epoch": 0.08094512195121951, + "grad_norm": 0.13512806991612536, + "kl": 0.0653076171875, + "learning_rate": 2.4283536585365854e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 531 + }, + { + "completion_length": 3042.8333740234375, + "epoch": 0.08109756097560976, + "grad_norm": 0.07187844616050446, + "kl": 0.049560546875, + "learning_rate": 2.432926829268293e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 532 + }, + { + "completion_length": 3466.8333740234375, + "epoch": 0.08125, + "grad_norm": 0.48379619256427026, + "kl": 0.056640625, + "learning_rate": 2.4375e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 533 + }, + { + "completion_length": 3413.0, + "epoch": 0.08140243902439025, + "grad_norm": 0.0463593577377373, + "kl": 0.0423583984375, + "learning_rate": 2.442073170731707e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 534 + }, + { + "completion_length": 1990.166748046875, + "epoch": 0.08155487804878049, + "grad_norm": 0.15525711494622366, + "kl": 0.07080078125, + "learning_rate": 2.4466463414634146e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 535 + }, + { + "completion_length": 3459.5001220703125, + "epoch": 0.08170731707317073, + "grad_norm": 0.05976664538756544, + "kl": 0.0426025390625, + "learning_rate": 2.451219512195122e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 536 + }, + { + "completion_length": 2554.6666870117188, + "epoch": 0.08185975609756098, + "grad_norm": 0.10120073903491479, + "kl": 0.0430908203125, + "learning_rate": 2.455792682926829e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 537 + }, + { + "completion_length": 1069.6666870117188, + "epoch": 0.08201219512195122, + "grad_norm": 0.15927565450120426, + "kl": 0.0567626953125, + "learning_rate": 2.4603658536585367e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 538 + }, + { + "completion_length": 1985.666748046875, + "epoch": 0.08216463414634147, + "grad_norm": 0.11102370705231226, + "kl": 0.04931640625, + "learning_rate": 2.464939024390244e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 539 + }, + { + "completion_length": 1381.0000305175781, + "epoch": 0.08231707317073171, + "grad_norm": 0.1266698978334977, + "kl": 0.046142578125, + "learning_rate": 2.4695121951219513e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 540 + }, + { + "completion_length": 2373.5000915527344, + "epoch": 0.08246951219512196, + "grad_norm": 0.1057689039627977, + "kl": 0.0384521484375, + "learning_rate": 2.4740853658536588e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 541 + }, + { + "completion_length": 1418.666748046875, + "epoch": 0.0826219512195122, + "grad_norm": 0.11838847787326476, + "kl": 0.0498046875, + "learning_rate": 2.478658536585366e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 542 + }, + { + "completion_length": 1630.3333740234375, + "epoch": 0.08277439024390244, + "grad_norm": 0.3256377252476011, + "kl": 0.0732421875, + "learning_rate": 2.4832317073170734e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 543 + }, + { + "completion_length": 1692.666748046875, + "epoch": 0.08292682926829269, + "grad_norm": 0.44892478902326494, + "kl": 0.0670166015625, + "learning_rate": 2.4878048780487805e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 544 + }, + { + "completion_length": 1582.3333740234375, + "epoch": 0.08307926829268293, + "grad_norm": 1.7091902518095317, + "kl": 0.0838623046875, + "learning_rate": 2.492378048780488e-06, + "loss": 0.0034, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 545 + }, + { + "completion_length": 1772.5, + "epoch": 0.08323170731707318, + "grad_norm": 0.13635729918429143, + "kl": 0.072998046875, + "learning_rate": 2.496951219512195e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 546 + }, + { + "completion_length": 2131.8333740234375, + "epoch": 0.08338414634146342, + "grad_norm": 1.7730215571395036, + "kl": 0.065673828125, + "learning_rate": 2.5015243902439026e-06, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 547 + }, + { + "completion_length": 1654.6666870117188, + "epoch": 0.08353658536585366, + "grad_norm": 0.1891104421798239, + "kl": 0.08447265625, + "learning_rate": 2.5060975609756097e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 548 + }, + { + "completion_length": 3544.666748046875, + "epoch": 0.08368902439024391, + "grad_norm": 0.08440326134872025, + "kl": 0.0587158203125, + "learning_rate": 2.510670731707317e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 549 + }, + { + "completion_length": 565.1666870117188, + "epoch": 0.08384146341463415, + "grad_norm": 0.38021512178208977, + "kl": 0.078125, + "learning_rate": 2.5152439024390243e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 550 + }, + { + "completion_length": 2400.5, + "epoch": 0.0839939024390244, + "grad_norm": 0.12046655222936543, + "kl": 0.0477294921875, + "learning_rate": 2.5198170731707318e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 551 + }, + { + "completion_length": 1937.3333740234375, + "epoch": 0.08414634146341464, + "grad_norm": 0.20182551090883163, + "kl": 0.0526123046875, + "learning_rate": 2.524390243902439e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 552 + }, + { + "completion_length": 3240.5001220703125, + "epoch": 0.08429878048780488, + "grad_norm": 0.08061727402577028, + "kl": 0.0521240234375, + "learning_rate": 2.5289634146341464e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 553 + }, + { + "completion_length": 2655.666748046875, + "epoch": 0.08445121951219513, + "grad_norm": 0.09273512902565882, + "kl": 0.0572509765625, + "learning_rate": 2.5335365853658534e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 554 + }, + { + "completion_length": 1486.1667175292969, + "epoch": 0.08460365853658537, + "grad_norm": 0.16126112186873598, + "kl": 0.0650634765625, + "learning_rate": 2.538109756097561e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 555 + }, + { + "completion_length": 1031.6667175292969, + "epoch": 0.08475609756097562, + "grad_norm": 0.1879724279258529, + "kl": 0.0460205078125, + "learning_rate": 2.5426829268292685e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 556 + }, + { + "completion_length": 1913.5000915527344, + "epoch": 0.08490853658536586, + "grad_norm": 1.7975884964987996, + "kl": 0.056884765625, + "learning_rate": 2.547256097560976e-06, + "loss": 0.0023, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 557 + }, + { + "completion_length": 792.1667175292969, + "epoch": 0.0850609756097561, + "grad_norm": 2.6952179267262437, + "kl": 0.0899658203125, + "learning_rate": 2.551829268292683e-06, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 558 + }, + { + "completion_length": 3099.83349609375, + "epoch": 0.08521341463414635, + "grad_norm": 0.10222482877404063, + "kl": 0.059326171875, + "learning_rate": 2.5564024390243906e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 559 + }, + { + "completion_length": 2305.666748046875, + "epoch": 0.08536585365853659, + "grad_norm": 3.329306321486602, + "kl": 0.0869140625, + "learning_rate": 2.5609756097560977e-06, + "loss": 0.0035, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 560 + }, + { + "completion_length": 2600.0, + "epoch": 0.08551829268292684, + "grad_norm": 0.17448662677453702, + "kl": 0.0736083984375, + "learning_rate": 2.565548780487805e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 561 + }, + { + "completion_length": 2855.5, + "epoch": 0.08567073170731708, + "grad_norm": 2.7022651895414582, + "kl": 0.068115234375, + "learning_rate": 2.5701219512195123e-06, + "loss": 0.0027, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 562 + }, + { + "completion_length": 1174.1666870117188, + "epoch": 0.08582317073170732, + "grad_norm": 0.14986397727752465, + "kl": 0.0638427734375, + "learning_rate": 2.5746951219512198e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 563 + }, + { + "completion_length": 1815.3333740234375, + "epoch": 0.08597560975609755, + "grad_norm": 0.09757232791997868, + "kl": 0.045166015625, + "learning_rate": 2.579268292682927e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 564 + }, + { + "completion_length": 2007.666748046875, + "epoch": 0.0861280487804878, + "grad_norm": 0.12074976219158004, + "kl": 0.0615234375, + "learning_rate": 2.5838414634146344e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 565 + }, + { + "completion_length": 1833.166748046875, + "epoch": 0.08628048780487804, + "grad_norm": 0.17143865322535476, + "kl": 0.098388671875, + "learning_rate": 2.5884146341463415e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 566 + }, + { + "completion_length": 1477.3333740234375, + "epoch": 0.08643292682926829, + "grad_norm": 0.10521879006731649, + "kl": 0.04559326171875, + "learning_rate": 2.592987804878049e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 567 + }, + { + "completion_length": 843.1666870117188, + "epoch": 0.08658536585365853, + "grad_norm": 0.2255182299677177, + "kl": 0.09228515625, + "learning_rate": 2.597560975609756e-06, + "loss": 0.0037, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 568 + }, + { + "completion_length": 830.5000610351562, + "epoch": 0.08673780487804877, + "grad_norm": 0.17799820909197583, + "kl": 0.0843505859375, + "learning_rate": 2.6021341463414636e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 569 + }, + { + "completion_length": 1249.3333740234375, + "epoch": 0.08689024390243902, + "grad_norm": 0.34597337300535946, + "kl": 0.082763671875, + "learning_rate": 2.6067073170731707e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 570 + }, + { + "completion_length": 2201.8333740234375, + "epoch": 0.08704268292682926, + "grad_norm": 0.28607679852389234, + "kl": 0.094970703125, + "learning_rate": 2.611280487804878e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 571 + }, + { + "completion_length": 1020.3333435058594, + "epoch": 0.0871951219512195, + "grad_norm": 0.3570082954353871, + "kl": 0.08056640625, + "learning_rate": 2.6158536585365853e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 572 + }, + { + "completion_length": 1400.6666870117188, + "epoch": 0.08734756097560975, + "grad_norm": 0.16280926138702134, + "kl": 0.06689453125, + "learning_rate": 2.6204268292682928e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 573 + }, + { + "completion_length": 742.8333587646484, + "epoch": 0.0875, + "grad_norm": 0.17031866368344598, + "kl": 0.092529296875, + "learning_rate": 2.6250000000000003e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 574 + }, + { + "completion_length": 2598.5001220703125, + "epoch": 0.08765243902439024, + "grad_norm": 0.10590022807805609, + "kl": 0.07470703125, + "learning_rate": 2.6295731707317074e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 575 + }, + { + "completion_length": 1200.1667175292969, + "epoch": 0.08780487804878048, + "grad_norm": 0.32949397065873043, + "kl": 0.0760498046875, + "learning_rate": 2.634146341463415e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 576 + }, + { + "completion_length": 883.6666870117188, + "epoch": 0.08795731707317073, + "grad_norm": 0.22064078561647785, + "kl": 0.1043701171875, + "learning_rate": 2.638719512195122e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 577 + }, + { + "completion_length": 1026.5000610351562, + "epoch": 0.08810975609756097, + "grad_norm": 0.10323323425258601, + "kl": 0.06689453125, + "learning_rate": 2.6432926829268295e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 578 + }, + { + "completion_length": 873.6666870117188, + "epoch": 0.08826219512195121, + "grad_norm": 0.15413924113051253, + "kl": 0.08349609375, + "learning_rate": 2.6478658536585366e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 579 + }, + { + "completion_length": 1340.3333740234375, + "epoch": 0.08841463414634146, + "grad_norm": 1.4875743424980332, + "kl": 0.07177734375, + "learning_rate": 2.652439024390244e-06, + "loss": 0.0029, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 580 + }, + { + "completion_length": 1568.0, + "epoch": 0.0885670731707317, + "grad_norm": 2.220759755081775, + "kl": 0.0665283203125, + "learning_rate": 2.657012195121951e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 581 + }, + { + "completion_length": 1919.6666870117188, + "epoch": 0.08871951219512195, + "grad_norm": 0.09266125438750046, + "kl": 0.0557861328125, + "learning_rate": 2.6615853658536587e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 582 + }, + { + "completion_length": 2451.166748046875, + "epoch": 0.08887195121951219, + "grad_norm": 0.08793234825411492, + "kl": 0.0540771484375, + "learning_rate": 2.6661585365853658e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 583 + }, + { + "completion_length": 1519.666748046875, + "epoch": 0.08902439024390243, + "grad_norm": 0.10654831466097743, + "kl": 0.079345703125, + "learning_rate": 2.6707317073170733e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 584 + }, + { + "completion_length": 1799.5000610351562, + "epoch": 0.08917682926829268, + "grad_norm": 0.1039547101798147, + "kl": 0.066650390625, + "learning_rate": 2.6753048780487804e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 585 + }, + { + "completion_length": 2126.0000915527344, + "epoch": 0.08932926829268292, + "grad_norm": 0.14095740554479, + "kl": 0.060791015625, + "learning_rate": 2.679878048780488e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 586 + }, + { + "completion_length": 819.8333435058594, + "epoch": 0.08948170731707317, + "grad_norm": 0.1273026944259071, + "kl": 0.0609130859375, + "learning_rate": 2.684451219512195e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 587 + }, + { + "completion_length": 903.8333740234375, + "epoch": 0.08963414634146341, + "grad_norm": 1.9638913422159134, + "kl": 0.084716796875, + "learning_rate": 2.6890243902439025e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 588 + }, + { + "completion_length": 1291.6666870117188, + "epoch": 0.08978658536585366, + "grad_norm": 0.14166831005902422, + "kl": 0.072021484375, + "learning_rate": 2.6935975609756096e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 589 + }, + { + "completion_length": 1679.5001220703125, + "epoch": 0.0899390243902439, + "grad_norm": 0.11745877550925711, + "kl": 0.0643310546875, + "learning_rate": 2.698170731707317e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 590 + }, + { + "completion_length": 2757.0, + "epoch": 0.09009146341463414, + "grad_norm": 0.0855249628701506, + "kl": 0.0517578125, + "learning_rate": 2.702743902439024e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 591 + }, + { + "completion_length": 2074.5000610351562, + "epoch": 0.09024390243902439, + "grad_norm": 0.12052460571997022, + "kl": 0.0518798828125, + "learning_rate": 2.7073170731707317e-06, + "loss": 0.0021, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 592 + }, + { + "completion_length": 3867.8333740234375, + "epoch": 0.09039634146341463, + "grad_norm": 0.06821274396880904, + "kl": 0.04638671875, + "learning_rate": 2.711890243902439e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 593 + }, + { + "completion_length": 2641.0001220703125, + "epoch": 0.09054878048780488, + "grad_norm": 0.07483430464412127, + "kl": 0.05322265625, + "learning_rate": 2.7164634146341467e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 594 + }, + { + "completion_length": 3511.5, + "epoch": 0.09070121951219512, + "grad_norm": 0.07027545292665387, + "kl": 0.051025390625, + "learning_rate": 2.7210365853658538e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 595 + }, + { + "completion_length": 1675.5, + "epoch": 0.09085365853658536, + "grad_norm": 2.5767657615188155, + "kl": 0.0870361328125, + "learning_rate": 2.7256097560975613e-06, + "loss": 0.0035, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 596 + }, + { + "completion_length": 2394.0000610351562, + "epoch": 0.09100609756097561, + "grad_norm": 0.09685973487188176, + "kl": 0.0543212890625, + "learning_rate": 2.7301829268292684e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 597 + }, + { + "completion_length": 3525.666748046875, + "epoch": 0.09115853658536585, + "grad_norm": 0.7652375591071371, + "kl": 0.0499267578125, + "learning_rate": 2.734756097560976e-06, + "loss": 0.002, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 598 + }, + { + "completion_length": 4027.8333740234375, + "epoch": 0.0913109756097561, + "grad_norm": 0.05608999415948512, + "kl": 0.0361328125, + "learning_rate": 2.739329268292683e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 599 + }, + { + "completion_length": 2535.1666870117188, + "epoch": 0.09146341463414634, + "grad_norm": 0.10754987350938129, + "kl": 0.0518798828125, + "learning_rate": 2.7439024390243905e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 600 + }, + { + "completion_length": 3340.0, + "epoch": 0.09161585365853658, + "grad_norm": 10.019218281999656, + "kl": 0.505615234375, + "learning_rate": 2.7484756097560976e-06, + "loss": 0.0203, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 601 + }, + { + "completion_length": 2521.1666870117188, + "epoch": 0.09176829268292683, + "grad_norm": 0.18364755554952156, + "kl": 0.04736328125, + "learning_rate": 2.753048780487805e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 602 + }, + { + "completion_length": 2515.5, + "epoch": 0.09192073170731707, + "grad_norm": 0.13708097763109936, + "kl": 0.0601806640625, + "learning_rate": 2.757621951219512e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 603 + }, + { + "completion_length": 2930.0, + "epoch": 0.09207317073170732, + "grad_norm": 0.15276011433441122, + "kl": 0.053466796875, + "learning_rate": 2.7621951219512197e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 604 + }, + { + "completion_length": 2195.3333435058594, + "epoch": 0.09222560975609756, + "grad_norm": 0.2723721615955353, + "kl": 0.088623046875, + "learning_rate": 2.7667682926829268e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 605 + }, + { + "completion_length": 1240.0000915527344, + "epoch": 0.0923780487804878, + "grad_norm": 0.1902812297181246, + "kl": 0.084228515625, + "learning_rate": 2.7713414634146343e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 606 + }, + { + "completion_length": 261.8333435058594, + "epoch": 0.09253048780487805, + "grad_norm": 0.2541638214362519, + "kl": 0.133544921875, + "learning_rate": 2.7759146341463414e-06, + "loss": 0.0053, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 607 + }, + { + "completion_length": 1129.6666870117188, + "epoch": 0.09268292682926829, + "grad_norm": 0.4691790896682525, + "kl": 0.106201171875, + "learning_rate": 2.780487804878049e-06, + "loss": 0.0043, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 608 + }, + { + "completion_length": 217.83334350585938, + "epoch": 0.09283536585365854, + "grad_norm": 0.35871990342509147, + "kl": 0.1611328125, + "learning_rate": 2.785060975609756e-06, + "loss": 0.0064, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 609 + }, + { + "completion_length": 498.0000305175781, + "epoch": 0.09298780487804878, + "grad_norm": 0.47624000275843614, + "kl": 0.092041015625, + "learning_rate": 2.7896341463414635e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 610 + }, + { + "completion_length": 607.8333587646484, + "epoch": 0.09314024390243902, + "grad_norm": 0.3547388992889126, + "kl": 0.1396484375, + "learning_rate": 2.794207317073171e-06, + "loss": 0.0056, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 611 + }, + { + "completion_length": 543.5000152587891, + "epoch": 0.09329268292682927, + "grad_norm": 0.2977441134426671, + "kl": 0.115234375, + "learning_rate": 2.798780487804878e-06, + "loss": 0.0046, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 612 + }, + { + "completion_length": 543.0, + "epoch": 0.09344512195121951, + "grad_norm": 0.3148898012548764, + "kl": 0.0634765625, + "learning_rate": 2.8033536585365856e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 613 + }, + { + "completion_length": 339.8333435058594, + "epoch": 0.09359756097560976, + "grad_norm": 0.24227467711696699, + "kl": 0.099365234375, + "learning_rate": 2.807926829268293e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 614 + }, + { + "completion_length": 665.3333740234375, + "epoch": 0.09375, + "grad_norm": 0.5224626717586146, + "kl": 0.069580078125, + "learning_rate": 2.8125e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 615 + }, + { + "completion_length": 435.3333435058594, + "epoch": 0.09390243902439024, + "grad_norm": 0.1643989546138478, + "kl": 0.084716796875, + "learning_rate": 2.8170731707317073e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 616 + }, + { + "completion_length": 276.83333587646484, + "epoch": 0.09405487804878049, + "grad_norm": 0.2223351959548888, + "kl": 0.0966796875, + "learning_rate": 2.8216463414634148e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 617 + }, + { + "completion_length": 718.0, + "epoch": 0.09420731707317073, + "grad_norm": 0.13595798418290542, + "kl": 0.0791015625, + "learning_rate": 2.826219512195122e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 618 + }, + { + "completion_length": 248.00001525878906, + "epoch": 0.09435975609756098, + "grad_norm": 0.299696192118697, + "kl": 0.12646484375, + "learning_rate": 2.8307926829268294e-06, + "loss": 0.0051, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 619 + }, + { + "completion_length": 376.3333435058594, + "epoch": 0.09451219512195122, + "grad_norm": 0.26092602976253565, + "kl": 0.0986328125, + "learning_rate": 2.8353658536585365e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 620 + }, + { + "completion_length": 575.5000152587891, + "epoch": 0.09466463414634146, + "grad_norm": 0.31613628783118075, + "kl": 0.082275390625, + "learning_rate": 2.839939024390244e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 621 + }, + { + "completion_length": 1072.8333740234375, + "epoch": 0.09481707317073171, + "grad_norm": 3.3960113334420057, + "kl": 0.0869140625, + "learning_rate": 2.844512195121951e-06, + "loss": 0.0035, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 622 + }, + { + "completion_length": 1856.6666870117188, + "epoch": 0.09496951219512195, + "grad_norm": 0.1177566307056559, + "kl": 0.06884765625, + "learning_rate": 2.8490853658536586e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 623 + }, + { + "completion_length": 879.0000305175781, + "epoch": 0.0951219512195122, + "grad_norm": 0.16101850788803423, + "kl": 0.0577392578125, + "learning_rate": 2.8536585365853657e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 624 + }, + { + "completion_length": 1650.5, + "epoch": 0.09527439024390244, + "grad_norm": 0.11528336930149904, + "kl": 0.0667724609375, + "learning_rate": 2.858231707317073e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 625 + }, + { + "completion_length": 1110.0, + "epoch": 0.09542682926829268, + "grad_norm": 0.15840169610512328, + "kl": 0.0927734375, + "learning_rate": 2.8628048780487803e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 626 + }, + { + "completion_length": 2867.166748046875, + "epoch": 0.09557926829268293, + "grad_norm": 0.07519362876413424, + "kl": 0.07275390625, + "learning_rate": 2.8673780487804878e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 627 + }, + { + "completion_length": 1575.8333435058594, + "epoch": 0.09573170731707317, + "grad_norm": 0.11055531577716785, + "kl": 0.061767578125, + "learning_rate": 2.8719512195121953e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 628 + }, + { + "completion_length": 1179.5, + "epoch": 0.09588414634146342, + "grad_norm": 0.1105192646754378, + "kl": 0.0467529296875, + "learning_rate": 2.8765243902439024e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 629 + }, + { + "completion_length": 1623.3333740234375, + "epoch": 0.09603658536585366, + "grad_norm": 0.2030097680583145, + "kl": 0.0767822265625, + "learning_rate": 2.88109756097561e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 630 + }, + { + "completion_length": 1134.6666870117188, + "epoch": 0.0961890243902439, + "grad_norm": 0.12230250014640694, + "kl": 0.0638427734375, + "learning_rate": 2.8856707317073174e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 631 + }, + { + "completion_length": 1473.0000610351562, + "epoch": 0.09634146341463415, + "grad_norm": 0.12950769520796485, + "kl": 0.070556640625, + "learning_rate": 2.8902439024390245e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 632 + }, + { + "completion_length": 1089.5000305175781, + "epoch": 0.09649390243902439, + "grad_norm": 3.285322060749227, + "kl": 0.058837890625, + "learning_rate": 2.894817073170732e-06, + "loss": 0.0023, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 633 + }, + { + "completion_length": 1093.0000305175781, + "epoch": 0.09664634146341464, + "grad_norm": 0.1867965980991632, + "kl": 0.0540771484375, + "learning_rate": 2.899390243902439e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 634 + }, + { + "completion_length": 801.8333435058594, + "epoch": 0.09679878048780488, + "grad_norm": 0.18605069331253418, + "kl": 0.053955078125, + "learning_rate": 2.9039634146341466e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 635 + }, + { + "completion_length": 1259.3333740234375, + "epoch": 0.09695121951219512, + "grad_norm": 0.24830376697570927, + "kl": 0.10205078125, + "learning_rate": 2.9085365853658537e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 636 + }, + { + "completion_length": 575.1666793823242, + "epoch": 0.09710365853658537, + "grad_norm": 0.42873382868446097, + "kl": 0.10693359375, + "learning_rate": 2.913109756097561e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 637 + }, + { + "completion_length": 1030.5000305175781, + "epoch": 0.09725609756097561, + "grad_norm": 0.3068167337600241, + "kl": 0.0753173828125, + "learning_rate": 2.9176829268292683e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 638 + }, + { + "completion_length": 693.8333435058594, + "epoch": 0.09740853658536586, + "grad_norm": 0.3338309962252227, + "kl": 0.074951171875, + "learning_rate": 2.9222560975609758e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 639 + }, + { + "completion_length": 824.0000305175781, + "epoch": 0.0975609756097561, + "grad_norm": 1.644221955930463, + "kl": 0.0699462890625, + "learning_rate": 2.926829268292683e-06, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 640 + }, + { + "completion_length": 1849.3334350585938, + "epoch": 0.09771341463414634, + "grad_norm": 1.9619454803222085, + "kl": 0.071044921875, + "learning_rate": 2.9314024390243904e-06, + "loss": 0.0028, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 641 + }, + { + "completion_length": 1332.3333435058594, + "epoch": 0.09786585365853659, + "grad_norm": 0.12435389690438355, + "kl": 0.0438232421875, + "learning_rate": 2.9359756097560975e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 642 + }, + { + "completion_length": 902.6666870117188, + "epoch": 0.09801829268292683, + "grad_norm": 2.8357302249267664, + "kl": 0.0682373046875, + "learning_rate": 2.940548780487805e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 643 + }, + { + "completion_length": 792.5000305175781, + "epoch": 0.09817073170731708, + "grad_norm": 641.7212448053186, + "kl": 0.19921875, + "learning_rate": 2.945121951219512e-06, + "loss": 0.008, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 644 + }, + { + "completion_length": 968.5000305175781, + "epoch": 0.09832317073170732, + "grad_norm": 1.4768847932625107, + "kl": 0.0567626953125, + "learning_rate": 2.9496951219512196e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 645 + }, + { + "completion_length": 595.6666870117188, + "epoch": 0.09847560975609757, + "grad_norm": 1.152391524983833, + "kl": 0.1376953125, + "learning_rate": 2.9542682926829267e-06, + "loss": 0.0055, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 646 + }, + { + "completion_length": 560.3333435058594, + "epoch": 0.09862804878048781, + "grad_norm": 2.777117828876149, + "kl": 0.084228515625, + "learning_rate": 2.958841463414634e-06, + "loss": 0.0034, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 647 + }, + { + "completion_length": 1309.3333435058594, + "epoch": 0.09878048780487805, + "grad_norm": 0.17356945742384153, + "kl": 0.080078125, + "learning_rate": 2.9634146341463417e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 648 + }, + { + "completion_length": 983.6666870117188, + "epoch": 0.0989329268292683, + "grad_norm": 0.10918032731269302, + "kl": 0.042236328125, + "learning_rate": 2.967987804878049e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 649 + }, + { + "completion_length": 651.8333740234375, + "epoch": 0.09908536585365854, + "grad_norm": 0.3096720554032609, + "kl": 0.0908203125, + "learning_rate": 2.9725609756097563e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 650 + }, + { + "completion_length": 493.3333435058594, + "epoch": 0.09923780487804879, + "grad_norm": 0.2574736657013272, + "kl": 0.107177734375, + "learning_rate": 2.977134146341464e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 651 + }, + { + "completion_length": 1663.5, + "epoch": 0.09939024390243903, + "grad_norm": 3.665896937940364, + "kl": 0.09765625, + "learning_rate": 2.981707317073171e-06, + "loss": 0.0039, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 652 + }, + { + "completion_length": 919.3333740234375, + "epoch": 0.09954268292682927, + "grad_norm": 0.1491637938999354, + "kl": 0.04736328125, + "learning_rate": 2.9862804878048784e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 653 + }, + { + "completion_length": 2706.6666870117188, + "epoch": 0.09969512195121952, + "grad_norm": 0.10655524730687527, + "kl": 0.0589599609375, + "learning_rate": 2.9908536585365855e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 654 + }, + { + "completion_length": 3138.83349609375, + "epoch": 0.09984756097560976, + "grad_norm": 0.22097319715141145, + "kl": 0.089111328125, + "learning_rate": 2.995426829268293e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 655 + }, + { + "completion_length": 1872.5000915527344, + "epoch": 0.1, + "grad_norm": 0.13886308193550875, + "kl": 0.071044921875, + "learning_rate": 3e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 656 + }, + { + "completion_length": 2087.166748046875, + "epoch": 0.10015243902439025, + "grad_norm": 0.09153490929283198, + "kl": 0.0574951171875, + "learning_rate": 2.9999997876421592e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 657 + }, + { + "completion_length": 3149.0001220703125, + "epoch": 0.1003048780487805, + "grad_norm": 0.10668811922759766, + "kl": 0.085205078125, + "learning_rate": 2.999999150568697e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 658 + }, + { + "completion_length": 2318.5001220703125, + "epoch": 0.10045731707317074, + "grad_norm": 1.1430580717391596, + "kl": 0.068603515625, + "learning_rate": 2.9999980887797937e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 659 + }, + { + "completion_length": 1728.8333740234375, + "epoch": 0.10060975609756098, + "grad_norm": 0.10633160702987289, + "kl": 0.075439453125, + "learning_rate": 2.9999966022757497e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 660 + }, + { + "completion_length": 1607.5, + "epoch": 0.10076219512195123, + "grad_norm": 0.26576210549651563, + "kl": 0.091552734375, + "learning_rate": 2.9999946910569862e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 661 + }, + { + "completion_length": 1173.3333740234375, + "epoch": 0.10091463414634147, + "grad_norm": 3.8540731730453706, + "kl": 0.12548828125, + "learning_rate": 2.9999923551240446e-06, + "loss": 0.005, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 662 + }, + { + "completion_length": 2403.166748046875, + "epoch": 0.10106707317073171, + "grad_norm": 0.10020432308918588, + "kl": 0.064208984375, + "learning_rate": 2.9999895944775858e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 663 + }, + { + "completion_length": 1649.3334350585938, + "epoch": 0.10121951219512196, + "grad_norm": 0.1864656139373003, + "kl": 0.08154296875, + "learning_rate": 2.9999864091183917e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 664 + }, + { + "completion_length": 569.8333587646484, + "epoch": 0.1013719512195122, + "grad_norm": 0.19907952847187269, + "kl": 0.07470703125, + "learning_rate": 2.999982799047364e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 665 + }, + { + "completion_length": 1199.3333740234375, + "epoch": 0.10152439024390245, + "grad_norm": 0.18279451030832952, + "kl": 0.057861328125, + "learning_rate": 2.999978764265525e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 666 + }, + { + "completion_length": 1721.1667175292969, + "epoch": 0.10167682926829268, + "grad_norm": 0.20882443133368672, + "kl": 0.1015625, + "learning_rate": 2.9999743047740175e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 667 + }, + { + "completion_length": 1414.0000610351562, + "epoch": 0.10182926829268292, + "grad_norm": 0.2573595878565368, + "kl": 0.09716796875, + "learning_rate": 2.999969420574104e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 668 + }, + { + "completion_length": 683.1666870117188, + "epoch": 0.10198170731707316, + "grad_norm": 2.966149927325114, + "kl": 0.122314453125, + "learning_rate": 2.9999641116671664e-06, + "loss": 0.0049, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 669 + }, + { + "completion_length": 1518.6666870117188, + "epoch": 0.10213414634146341, + "grad_norm": 0.16243991376604652, + "kl": 0.103271484375, + "learning_rate": 2.9999583780547095e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 670 + }, + { + "completion_length": 1128.3333740234375, + "epoch": 0.10228658536585365, + "grad_norm": 0.25788056354269573, + "kl": 0.0848388671875, + "learning_rate": 2.9999522197383557e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 671 + }, + { + "completion_length": 1465.6666870117188, + "epoch": 0.1024390243902439, + "grad_norm": 0.11199799961776889, + "kl": 0.0592041015625, + "learning_rate": 2.999945636719849e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 672 + }, + { + "completion_length": 856.3333435058594, + "epoch": 0.10259146341463414, + "grad_norm": 0.13640033376598315, + "kl": 0.06982421875, + "learning_rate": 2.9999386290010534e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 673 + }, + { + "completion_length": 642.0000152587891, + "epoch": 0.10274390243902438, + "grad_norm": 0.1845054189605759, + "kl": 0.107177734375, + "learning_rate": 2.999931196583953e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 674 + }, + { + "completion_length": 1554.5000915527344, + "epoch": 0.10289634146341463, + "grad_norm": 0.10532536734826814, + "kl": 0.081298828125, + "learning_rate": 2.999923339470652e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 675 + }, + { + "completion_length": 636.8333740234375, + "epoch": 0.10304878048780487, + "grad_norm": 2.9389948698197768, + "kl": 0.09912109375, + "learning_rate": 2.9999150576633756e-06, + "loss": 0.004, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 676 + }, + { + "completion_length": 1083.8333740234375, + "epoch": 0.10320121951219512, + "grad_norm": 0.13019549321035248, + "kl": 0.065185546875, + "learning_rate": 2.9999063511644685e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 677 + }, + { + "completion_length": 1907.5, + "epoch": 0.10335365853658536, + "grad_norm": 0.08872086224562305, + "kl": 0.061279296875, + "learning_rate": 2.999897219976396e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 678 + }, + { + "completion_length": 1806.8334350585938, + "epoch": 0.1035060975609756, + "grad_norm": 0.1727284292012855, + "kl": 0.1024169921875, + "learning_rate": 2.999887664101743e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 679 + }, + { + "completion_length": 1252.0000610351562, + "epoch": 0.10365853658536585, + "grad_norm": 0.13173302096458497, + "kl": 0.068359375, + "learning_rate": 2.999877683543216e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 680 + }, + { + "completion_length": 1053.5, + "epoch": 0.10381097560975609, + "grad_norm": 0.23367056280222803, + "kl": 0.0693359375, + "learning_rate": 2.99986727830364e-06, + "loss": 0.0028, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 681 + }, + { + "completion_length": 1055.1666870117188, + "epoch": 0.10396341463414634, + "grad_norm": 0.11782494351632544, + "kl": 0.0626220703125, + "learning_rate": 2.9998564483859627e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 682 + }, + { + "completion_length": 1683.6666870117188, + "epoch": 0.10411585365853658, + "grad_norm": 0.18655761575057186, + "kl": 0.078125, + "learning_rate": 2.999845193793249e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 683 + }, + { + "completion_length": 2203.8333740234375, + "epoch": 0.10426829268292682, + "grad_norm": 2.5482331361957953, + "kl": 0.0726318359375, + "learning_rate": 2.9998335145286857e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 684 + }, + { + "completion_length": 791.8333740234375, + "epoch": 0.10442073170731707, + "grad_norm": 0.14422525087983595, + "kl": 0.072265625, + "learning_rate": 2.9998214105955806e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 685 + }, + { + "completion_length": 920.3333740234375, + "epoch": 0.10457317073170731, + "grad_norm": 0.14349976248319396, + "kl": 0.0596923828125, + "learning_rate": 2.9998088819973605e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 686 + }, + { + "completion_length": 745.8333435058594, + "epoch": 0.10472560975609756, + "grad_norm": 2.273388902097512, + "kl": 0.063720703125, + "learning_rate": 2.9997959287375723e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 687 + }, + { + "completion_length": 1341.1666870117188, + "epoch": 0.1048780487804878, + "grad_norm": 0.12447148468676751, + "kl": 0.083251953125, + "learning_rate": 2.999782550819884e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 688 + }, + { + "completion_length": 1115.8333740234375, + "epoch": 0.10503048780487804, + "grad_norm": 0.261013556002395, + "kl": 0.06298828125, + "learning_rate": 2.999768748248084e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 689 + }, + { + "completion_length": 1213.6666870117188, + "epoch": 0.10518292682926829, + "grad_norm": 0.132324418641855, + "kl": 0.068359375, + "learning_rate": 2.9997545210260794e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 690 + }, + { + "completion_length": 1715.8333740234375, + "epoch": 0.10533536585365853, + "grad_norm": 0.1190472965891011, + "kl": 0.0462646484375, + "learning_rate": 2.999739869157899e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 691 + }, + { + "completion_length": 919.5000305175781, + "epoch": 0.10548780487804878, + "grad_norm": 0.15989689827638429, + "kl": 0.0703125, + "learning_rate": 2.9997247926476918e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 692 + }, + { + "completion_length": 1773.3333740234375, + "epoch": 0.10564024390243902, + "grad_norm": 0.2167120971361502, + "kl": 0.093505859375, + "learning_rate": 2.999709291499726e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 693 + }, + { + "completion_length": 1254.3333740234375, + "epoch": 0.10579268292682927, + "grad_norm": 0.15541024353870747, + "kl": 0.07275390625, + "learning_rate": 2.999693365718391e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 694 + }, + { + "completion_length": 1051.0000305175781, + "epoch": 0.10594512195121951, + "grad_norm": 2.865359570552944, + "kl": 0.0947265625, + "learning_rate": 2.9996770153081955e-06, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 695 + }, + { + "completion_length": 2115.666748046875, + "epoch": 0.10609756097560975, + "grad_norm": 0.3173442349496276, + "kl": 0.073974609375, + "learning_rate": 2.99966024027377e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 696 + }, + { + "completion_length": 2183.166748046875, + "epoch": 0.10625, + "grad_norm": 0.37843305355335727, + "kl": 0.066650390625, + "learning_rate": 2.9996430406198637e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 697 + }, + { + "completion_length": 1688.3333435058594, + "epoch": 0.10640243902439024, + "grad_norm": 20.029803178712385, + "kl": 0.49951171875, + "learning_rate": 2.9996254163513466e-06, + "loss": 0.02, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 698 + }, + { + "completion_length": 1034.5000305175781, + "epoch": 0.10655487804878049, + "grad_norm": 0.2640477130975413, + "kl": 0.07958984375, + "learning_rate": 2.999607367473209e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 699 + }, + { + "completion_length": 3338.5, + "epoch": 0.10670731707317073, + "grad_norm": 1.1637057844587333, + "kl": 0.0606689453125, + "learning_rate": 2.999588893990561e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 700 + }, + { + "completion_length": 2123.0000915527344, + "epoch": 0.10685975609756097, + "grad_norm": 0.10667605657610534, + "kl": 0.06005859375, + "learning_rate": 2.999569995908634e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 701 + }, + { + "completion_length": 1204.6666870117188, + "epoch": 0.10701219512195122, + "grad_norm": 1.9580453259623953, + "kl": 0.095458984375, + "learning_rate": 2.999550673232778e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 702 + }, + { + "completion_length": 1062.0000610351562, + "epoch": 0.10716463414634146, + "grad_norm": 1.0590855664299756, + "kl": 0.081787109375, + "learning_rate": 2.999530925968464e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 703 + }, + { + "completion_length": 1591.5, + "epoch": 0.1073170731707317, + "grad_norm": 0.13223839011854313, + "kl": 0.063720703125, + "learning_rate": 2.9995107541212846e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 704 + }, + { + "completion_length": 896.0000457763672, + "epoch": 0.10746951219512195, + "grad_norm": 0.16589223442676096, + "kl": 0.07568359375, + "learning_rate": 2.99949015769695e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 705 + }, + { + "completion_length": 1808.3334350585938, + "epoch": 0.1076219512195122, + "grad_norm": 0.13016685730895605, + "kl": 0.062255859375, + "learning_rate": 2.999469136701293e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 706 + }, + { + "completion_length": 1806.1666870117188, + "epoch": 0.10777439024390244, + "grad_norm": 0.1416532287268834, + "kl": 0.071044921875, + "learning_rate": 2.999447691140265e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 707 + }, + { + "completion_length": 2271.1666870117188, + "epoch": 0.10792682926829268, + "grad_norm": 0.19463420353526997, + "kl": 0.07275390625, + "learning_rate": 2.999425821019938e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 708 + }, + { + "completion_length": 2573.5, + "epoch": 0.10807926829268293, + "grad_norm": 0.109773836745363, + "kl": 0.0498046875, + "learning_rate": 2.999403526346504e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 709 + }, + { + "completion_length": 1682.3333740234375, + "epoch": 0.10823170731707317, + "grad_norm": 1.0065337007978863, + "kl": 0.0784912109375, + "learning_rate": 2.9993808071262773e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 710 + }, + { + "completion_length": 580.0000152587891, + "epoch": 0.10838414634146341, + "grad_norm": 0.14118810873782986, + "kl": 0.075927734375, + "learning_rate": 2.999357663365689e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 711 + }, + { + "completion_length": 2376.6666870117188, + "epoch": 0.10853658536585366, + "grad_norm": 0.12979749050590686, + "kl": 0.0667724609375, + "learning_rate": 2.999334095071293e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 712 + }, + { + "completion_length": 1907.6666870117188, + "epoch": 0.1086890243902439, + "grad_norm": 0.10091637425072977, + "kl": 0.0533447265625, + "learning_rate": 2.999310102249762e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 713 + }, + { + "completion_length": 1280.8333740234375, + "epoch": 0.10884146341463415, + "grad_norm": 42.277705633802434, + "kl": 0.1162109375, + "learning_rate": 2.9992856849078897e-06, + "loss": 0.0046, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 714 + }, + { + "completion_length": 2607.83349609375, + "epoch": 0.10899390243902439, + "grad_norm": 0.08432993545313831, + "kl": 0.0609130859375, + "learning_rate": 2.9992608430525895e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 715 + }, + { + "completion_length": 1191.6666870117188, + "epoch": 0.10914634146341463, + "grad_norm": 0.24152120219250953, + "kl": 0.093017578125, + "learning_rate": 2.999235576690896e-06, + "loss": 0.0037, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 716 + }, + { + "completion_length": 734.5000305175781, + "epoch": 0.10929878048780488, + "grad_norm": 0.2588108308804277, + "kl": 0.107177734375, + "learning_rate": 2.999209885829962e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 717 + }, + { + "completion_length": 879.3333435058594, + "epoch": 0.10945121951219512, + "grad_norm": 0.24379045169673774, + "kl": 0.089111328125, + "learning_rate": 2.9991837704770627e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 718 + }, + { + "completion_length": 1734.1666870117188, + "epoch": 0.10960365853658537, + "grad_norm": 0.1466501103912589, + "kl": 0.0596923828125, + "learning_rate": 2.9991572306395917e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 719 + }, + { + "completion_length": 944.5, + "epoch": 0.10975609756097561, + "grad_norm": 0.6536622261047682, + "kl": 0.1201171875, + "learning_rate": 2.9991302663250642e-06, + "loss": 0.0048, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 720 + }, + { + "completion_length": 846.3333740234375, + "epoch": 0.10990853658536585, + "grad_norm": 0.1585801138519843, + "kl": 0.0789794921875, + "learning_rate": 2.999102877541115e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 721 + }, + { + "completion_length": 1077.5000610351562, + "epoch": 0.1100609756097561, + "grad_norm": 0.14199751046838557, + "kl": 0.0704345703125, + "learning_rate": 2.9990750642954984e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 722 + }, + { + "completion_length": 1272.0000610351562, + "epoch": 0.11021341463414634, + "grad_norm": 0.10771646731658437, + "kl": 0.075927734375, + "learning_rate": 2.9990468265960904e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 723 + }, + { + "completion_length": 384.16668701171875, + "epoch": 0.11036585365853659, + "grad_norm": 0.27080050995221056, + "kl": 0.15185546875, + "learning_rate": 2.9990181644508856e-06, + "loss": 0.0061, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 724 + }, + { + "completion_length": 1462.1666870117188, + "epoch": 0.11051829268292683, + "grad_norm": 0.42978138914145225, + "kl": 0.076904296875, + "learning_rate": 2.998989077868e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 725 + }, + { + "completion_length": 470.6666717529297, + "epoch": 0.11067073170731707, + "grad_norm": 0.36365078776744253, + "kl": 0.14208984375, + "learning_rate": 2.9989595668556694e-06, + "loss": 0.0057, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 726 + }, + { + "completion_length": 987.3333740234375, + "epoch": 0.11082317073170732, + "grad_norm": 0.2043620309984938, + "kl": 0.087158203125, + "learning_rate": 2.9989296314222487e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 727 + }, + { + "completion_length": 1267.5, + "epoch": 0.11097560975609756, + "grad_norm": 0.19233548932174088, + "kl": 0.066650390625, + "learning_rate": 2.9988992715762147e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 728 + }, + { + "completion_length": 418.00001525878906, + "epoch": 0.1111280487804878, + "grad_norm": 0.17820477023492737, + "kl": 0.0810546875, + "learning_rate": 2.998868487326164e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 729 + }, + { + "completion_length": 666.5000305175781, + "epoch": 0.11128048780487805, + "grad_norm": 0.23185042815389745, + "kl": 0.107666015625, + "learning_rate": 2.9988372786808124e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 730 + }, + { + "completion_length": 1786.6666870117188, + "epoch": 0.1114329268292683, + "grad_norm": 0.14830976403788024, + "kl": 0.07080078125, + "learning_rate": 2.9988056456489967e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 731 + }, + { + "completion_length": 810.5000305175781, + "epoch": 0.11158536585365854, + "grad_norm": 0.13144766706952485, + "kl": 0.0888671875, + "learning_rate": 2.998773588239673e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 732 + }, + { + "completion_length": 1482.3333740234375, + "epoch": 0.11173780487804878, + "grad_norm": 8.528666568688648, + "kl": 0.1229248046875, + "learning_rate": 2.9987411064619184e-06, + "loss": 0.0049, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 733 + }, + { + "completion_length": 1109.6666870117188, + "epoch": 0.11189024390243903, + "grad_norm": 0.1474927873688132, + "kl": 0.094482421875, + "learning_rate": 2.9987082003249306e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 734 + }, + { + "completion_length": 1211.8333740234375, + "epoch": 0.11204268292682927, + "grad_norm": 0.2076801885231453, + "kl": 0.07421875, + "learning_rate": 2.998674869838026e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 735 + }, + { + "completion_length": 1367.8333740234375, + "epoch": 0.11219512195121951, + "grad_norm": 0.15115242545870172, + "kl": 0.091796875, + "learning_rate": 2.9986411150106423e-06, + "loss": 0.0037, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 736 + }, + { + "completion_length": 1763.3333740234375, + "epoch": 0.11234756097560976, + "grad_norm": 0.2618082646851062, + "kl": 0.07763671875, + "learning_rate": 2.998606935852337e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 737 + }, + { + "completion_length": 838.8333740234375, + "epoch": 0.1125, + "grad_norm": 0.1589802939200671, + "kl": 0.07421875, + "learning_rate": 2.998572332372787e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 738 + }, + { + "completion_length": 957.6666870117188, + "epoch": 0.11265243902439025, + "grad_norm": 0.12389543018940177, + "kl": 0.04638671875, + "learning_rate": 2.9985373045817905e-06, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 739 + }, + { + "completion_length": 1475.1667175292969, + "epoch": 0.11280487804878049, + "grad_norm": 0.18038150067846626, + "kl": 0.0648193359375, + "learning_rate": 2.998501852489266e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 740 + }, + { + "completion_length": 1235.5000610351562, + "epoch": 0.11295731707317073, + "grad_norm": 0.1988873447609937, + "kl": 0.081787109375, + "learning_rate": 2.998465976105251e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 741 + }, + { + "completion_length": 1117.8333587646484, + "epoch": 0.11310975609756098, + "grad_norm": 0.13650140648733652, + "kl": 0.0609130859375, + "learning_rate": 2.9984296754399037e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 742 + }, + { + "completion_length": 825.5, + "epoch": 0.11326219512195122, + "grad_norm": 0.6272368575964137, + "kl": 0.0626220703125, + "learning_rate": 2.9983929505035022e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 743 + }, + { + "completion_length": 734.8333435058594, + "epoch": 0.11341463414634147, + "grad_norm": 0.7201190938149266, + "kl": 0.11376953125, + "learning_rate": 2.9983558013064455e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 744 + }, + { + "completion_length": 1008.1666870117188, + "epoch": 0.11356707317073171, + "grad_norm": 0.17619990921683057, + "kl": 0.079345703125, + "learning_rate": 2.9983182278592517e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 745 + }, + { + "completion_length": 831.3333435058594, + "epoch": 0.11371951219512196, + "grad_norm": 0.1161817430414517, + "kl": 0.079345703125, + "learning_rate": 2.99828023017256e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 746 + }, + { + "completion_length": 844.3333740234375, + "epoch": 0.1138719512195122, + "grad_norm": 0.16658641737306953, + "kl": 0.089599609375, + "learning_rate": 2.998241808257128e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 747 + }, + { + "completion_length": 752.5, + "epoch": 0.11402439024390244, + "grad_norm": 0.13237097089698885, + "kl": 0.069091796875, + "learning_rate": 2.998202962123836e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 748 + }, + { + "completion_length": 1020.3333435058594, + "epoch": 0.11417682926829269, + "grad_norm": 0.13246796146854173, + "kl": 0.07177734375, + "learning_rate": 2.998163691783683e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 749 + }, + { + "completion_length": 774.1666870117188, + "epoch": 0.11432926829268293, + "grad_norm": 0.1614837966694169, + "kl": 0.078857421875, + "learning_rate": 2.9981239972477866e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 750 + }, + { + "completion_length": 976.5000305175781, + "epoch": 0.11448170731707318, + "grad_norm": 0.11309513851775631, + "kl": 0.0596923828125, + "learning_rate": 2.9980838785273876e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 751 + }, + { + "completion_length": 1380.0000915527344, + "epoch": 0.11463414634146342, + "grad_norm": 0.10404549681502508, + "kl": 0.0474853515625, + "learning_rate": 2.998043335633845e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 752 + }, + { + "completion_length": 1318.166748046875, + "epoch": 0.11478658536585366, + "grad_norm": 0.08812744194713858, + "kl": 0.05078125, + "learning_rate": 2.9980023685786383e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 753 + }, + { + "completion_length": 694.6666870117188, + "epoch": 0.11493902439024391, + "grad_norm": 0.31122985005703646, + "kl": 0.078857421875, + "learning_rate": 2.9979609773733663e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 754 + }, + { + "completion_length": 709.0000457763672, + "epoch": 0.11509146341463415, + "grad_norm": 0.2483951443925594, + "kl": 0.096435546875, + "learning_rate": 2.99791916202975e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 755 + }, + { + "completion_length": 1843.6666870117188, + "epoch": 0.1152439024390244, + "grad_norm": 0.09566162327619088, + "kl": 0.0576171875, + "learning_rate": 2.997876922559628e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 756 + }, + { + "completion_length": 787.1666870117188, + "epoch": 0.11539634146341464, + "grad_norm": 0.12933920251873784, + "kl": 0.0665283203125, + "learning_rate": 2.9978342589749606e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 757 + }, + { + "completion_length": 875.6666870117188, + "epoch": 0.11554878048780488, + "grad_norm": 0.10653806986612901, + "kl": 0.069091796875, + "learning_rate": 2.9977911712878275e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 758 + }, + { + "completion_length": 1941.1666870117188, + "epoch": 0.11570121951219513, + "grad_norm": 0.08775993008516839, + "kl": 0.0533447265625, + "learning_rate": 2.99774765951043e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 759 + }, + { + "completion_length": 856.6667175292969, + "epoch": 0.11585365853658537, + "grad_norm": 0.09669191293056033, + "kl": 0.059326171875, + "learning_rate": 2.997703723655086e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 760 + }, + { + "completion_length": 1441.3333740234375, + "epoch": 0.11600609756097562, + "grad_norm": 0.11893732478036158, + "kl": 0.0673828125, + "learning_rate": 2.997659363734237e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 761 + }, + { + "completion_length": 806.5000305175781, + "epoch": 0.11615853658536586, + "grad_norm": 0.1441894192800728, + "kl": 0.07470703125, + "learning_rate": 2.9976145797604433e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 762 + }, + { + "completion_length": 783.1666870117188, + "epoch": 0.1163109756097561, + "grad_norm": 0.18736510284146093, + "kl": 0.08056640625, + "learning_rate": 2.9975693717463845e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 763 + }, + { + "completion_length": 725.1666870117188, + "epoch": 0.11646341463414635, + "grad_norm": 0.12964521442853738, + "kl": 0.05908203125, + "learning_rate": 2.9975237397048618e-06, + "loss": 0.0024, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 764 + }, + { + "completion_length": 1260.6667175292969, + "epoch": 0.11661585365853659, + "grad_norm": 0.14522444063099257, + "kl": 0.060791015625, + "learning_rate": 2.997477683648795e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 765 + }, + { + "completion_length": 941.0000305175781, + "epoch": 0.11676829268292684, + "grad_norm": 0.09331156765358785, + "kl": 0.0673828125, + "learning_rate": 2.997431203591225e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 766 + }, + { + "completion_length": 1058.3333435058594, + "epoch": 0.11692073170731708, + "grad_norm": 0.10859113744306667, + "kl": 0.06884765625, + "learning_rate": 2.997384299545312e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 767 + }, + { + "completion_length": 1129.3333740234375, + "epoch": 0.11707317073170732, + "grad_norm": 0.1494422140640862, + "kl": 0.0653076171875, + "learning_rate": 2.9973369715243363e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 768 + }, + { + "completion_length": 1133.6667175292969, + "epoch": 0.11722560975609755, + "grad_norm": 0.12774837846666168, + "kl": 0.068603515625, + "learning_rate": 2.9972892195416996e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 769 + }, + { + "completion_length": 1085.3333740234375, + "epoch": 0.1173780487804878, + "grad_norm": 0.15503070844261024, + "kl": 0.056884765625, + "learning_rate": 2.997241043610922e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 770 + }, + { + "completion_length": 1048.6667175292969, + "epoch": 0.11753048780487804, + "grad_norm": 0.18971872512476776, + "kl": 0.0465087890625, + "learning_rate": 2.9971924437456436e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 771 + }, + { + "completion_length": 1374.0000610351562, + "epoch": 0.11768292682926829, + "grad_norm": 0.10356069550745907, + "kl": 0.0643310546875, + "learning_rate": 2.997143419959626e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 772 + }, + { + "completion_length": 927.0, + "epoch": 0.11783536585365853, + "grad_norm": 0.16365350644033397, + "kl": 0.056640625, + "learning_rate": 2.99709397226675e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 773 + }, + { + "completion_length": 693.1666870117188, + "epoch": 0.11798780487804877, + "grad_norm": 2.5403278467447206, + "kl": 0.072021484375, + "learning_rate": 2.9970441006810155e-06, + "loss": 0.0029, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 774 + }, + { + "completion_length": 874.1666870117188, + "epoch": 0.11814024390243902, + "grad_norm": 0.1304502547458923, + "kl": 0.0640869140625, + "learning_rate": 2.996993805216544e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 775 + }, + { + "completion_length": 729.3333740234375, + "epoch": 0.11829268292682926, + "grad_norm": 0.1655404441891715, + "kl": 0.0579833984375, + "learning_rate": 2.996943085887577e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 776 + }, + { + "completion_length": 963.8333740234375, + "epoch": 0.1184451219512195, + "grad_norm": 0.11877553842990166, + "kl": 0.0648193359375, + "learning_rate": 2.996891942708474e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 777 + }, + { + "completion_length": 1524.3333740234375, + "epoch": 0.11859756097560975, + "grad_norm": 0.12766904344519522, + "kl": 0.07470703125, + "learning_rate": 2.996840375693716e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 778 + }, + { + "completion_length": 1218.0000610351562, + "epoch": 0.11875, + "grad_norm": 0.15564834064756478, + "kl": 0.0555419921875, + "learning_rate": 2.996788384857905e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 779 + }, + { + "completion_length": 1527.3333740234375, + "epoch": 0.11890243902439024, + "grad_norm": 0.1207877073131374, + "kl": 0.054443359375, + "learning_rate": 2.9967359702157616e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 780 + }, + { + "completion_length": 2391.1666870117188, + "epoch": 0.11905487804878048, + "grad_norm": 0.10583679092579701, + "kl": 0.0469970703125, + "learning_rate": 2.996683131782126e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 781 + }, + { + "completion_length": 1306.6666870117188, + "epoch": 0.11920731707317073, + "grad_norm": 0.11881578590663751, + "kl": 0.062255859375, + "learning_rate": 2.9966298695719595e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 782 + }, + { + "completion_length": 2085.5, + "epoch": 0.11935975609756097, + "grad_norm": 2.3942359657446426, + "kl": 0.0567626953125, + "learning_rate": 2.9965761836003426e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 783 + }, + { + "completion_length": 1828.8334350585938, + "epoch": 0.11951219512195121, + "grad_norm": 0.11701564558460781, + "kl": 0.0479736328125, + "learning_rate": 2.996522073882477e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 784 + }, + { + "completion_length": 1225.6666870117188, + "epoch": 0.11966463414634146, + "grad_norm": 0.09958034500007218, + "kl": 0.054931640625, + "learning_rate": 2.996467540433682e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 785 + }, + { + "completion_length": 2094.166748046875, + "epoch": 0.1198170731707317, + "grad_norm": 0.15322417810119496, + "kl": 0.066650390625, + "learning_rate": 2.9964125832694003e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 786 + }, + { + "completion_length": 782.5000457763672, + "epoch": 0.11996951219512195, + "grad_norm": 0.12761189196717965, + "kl": 0.06298828125, + "learning_rate": 2.9963572024051915e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 787 + }, + { + "completion_length": 999.6666870117188, + "epoch": 0.12012195121951219, + "grad_norm": 0.12842792288445912, + "kl": 0.04931640625, + "learning_rate": 2.996301397856737e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 788 + }, + { + "completion_length": 890.0000305175781, + "epoch": 0.12027439024390243, + "grad_norm": 0.22929617566568053, + "kl": 0.066650390625, + "learning_rate": 2.9962451696398365e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 789 + }, + { + "completion_length": 1148.0000305175781, + "epoch": 0.12042682926829268, + "grad_norm": 0.13647821369575036, + "kl": 0.0577392578125, + "learning_rate": 2.996188517770411e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 790 + }, + { + "completion_length": 1405.5, + "epoch": 0.12057926829268292, + "grad_norm": 0.10885398228912746, + "kl": 0.0545654296875, + "learning_rate": 2.996131442264502e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 791 + }, + { + "completion_length": 980.8333435058594, + "epoch": 0.12073170731707317, + "grad_norm": 0.10961810043134117, + "kl": 0.0504150390625, + "learning_rate": 2.9960739431382697e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 792 + }, + { + "completion_length": 699.3333435058594, + "epoch": 0.12088414634146341, + "grad_norm": 0.15853269771199227, + "kl": 0.0701904296875, + "learning_rate": 2.996016020407994e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 793 + }, + { + "completion_length": 666.0000152587891, + "epoch": 0.12103658536585366, + "grad_norm": 0.22560928896270868, + "kl": 0.068115234375, + "learning_rate": 2.9959576740900758e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 794 + }, + { + "completion_length": 1010.0000305175781, + "epoch": 0.1211890243902439, + "grad_norm": 0.1383265069234329, + "kl": 0.0582275390625, + "learning_rate": 2.9958989042010355e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 795 + }, + { + "completion_length": 1609.166748046875, + "epoch": 0.12134146341463414, + "grad_norm": 0.08055672662793116, + "kl": 0.041015625, + "learning_rate": 2.9958397107575134e-06, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 796 + }, + { + "completion_length": 611.8333435058594, + "epoch": 0.12149390243902439, + "grad_norm": 0.1341361019188414, + "kl": 0.0504150390625, + "learning_rate": 2.99578009377627e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 797 + }, + { + "completion_length": 683.6666870117188, + "epoch": 0.12164634146341463, + "grad_norm": 0.16299352192690195, + "kl": 0.0633544921875, + "learning_rate": 2.9957200532741847e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 798 + }, + { + "completion_length": 725.1666870117188, + "epoch": 0.12179878048780488, + "grad_norm": 0.11125686796684711, + "kl": 0.061279296875, + "learning_rate": 2.9956595892682585e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 799 + }, + { + "completion_length": 726.8333740234375, + "epoch": 0.12195121951219512, + "grad_norm": 0.22390595210872408, + "kl": 0.084716796875, + "learning_rate": 2.9955987017756107e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 800 + }, + { + "completion_length": 1248.8333740234375, + "epoch": 0.12210365853658536, + "grad_norm": 0.07639756031280641, + "kl": 0.046142578125, + "learning_rate": 2.995537390813482e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 801 + }, + { + "completion_length": 862.5000305175781, + "epoch": 0.12225609756097561, + "grad_norm": 1.748062002890352, + "kl": 0.061279296875, + "learning_rate": 2.9954756563992313e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 802 + }, + { + "completion_length": 1305.8333740234375, + "epoch": 0.12240853658536585, + "grad_norm": 0.11327074342227351, + "kl": 0.0604248046875, + "learning_rate": 2.995413498550339e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 803 + }, + { + "completion_length": 740.1666870117188, + "epoch": 0.1225609756097561, + "grad_norm": 0.10994696415785135, + "kl": 0.05322265625, + "learning_rate": 2.9953509172844047e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 804 + }, + { + "completion_length": 956.3333740234375, + "epoch": 0.12271341463414634, + "grad_norm": 0.22035474796082413, + "kl": 0.0665283203125, + "learning_rate": 2.9952879126191473e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 805 + }, + { + "completion_length": 968.1666870117188, + "epoch": 0.12286585365853658, + "grad_norm": 0.12229980642935677, + "kl": 0.0533447265625, + "learning_rate": 2.9952244845724067e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 806 + }, + { + "completion_length": 1934.1666870117188, + "epoch": 0.12301829268292683, + "grad_norm": 0.08862415789644766, + "kl": 0.0491943359375, + "learning_rate": 2.995160633162142e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 807 + }, + { + "completion_length": 2472.8333740234375, + "epoch": 0.12317073170731707, + "grad_norm": 0.08642103024975478, + "kl": 0.0540771484375, + "learning_rate": 2.9950963584064327e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 808 + }, + { + "completion_length": 936.0, + "epoch": 0.12332317073170732, + "grad_norm": 0.14041914853667045, + "kl": 0.05615234375, + "learning_rate": 2.995031660323477e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 809 + }, + { + "completion_length": 1632.5000610351562, + "epoch": 0.12347560975609756, + "grad_norm": 0.2117872442628367, + "kl": 0.0655517578125, + "learning_rate": 2.9949665389315944e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 810 + }, + { + "completion_length": 511.3333435058594, + "epoch": 0.1236280487804878, + "grad_norm": 0.1469811999359725, + "kl": 0.058349609375, + "learning_rate": 2.9949009942492233e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 811 + }, + { + "completion_length": 1857.0000610351562, + "epoch": 0.12378048780487805, + "grad_norm": 0.10376066040463636, + "kl": 0.05322265625, + "learning_rate": 2.9948350262949224e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 812 + }, + { + "completion_length": 952.8333740234375, + "epoch": 0.12393292682926829, + "grad_norm": 0.2239720488626106, + "kl": 0.068603515625, + "learning_rate": 2.99476863508737e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 813 + }, + { + "completion_length": 1143.166748046875, + "epoch": 0.12408536585365854, + "grad_norm": 0.1724320017960135, + "kl": 0.0675048828125, + "learning_rate": 2.994701820645365e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 814 + }, + { + "completion_length": 1539.3333435058594, + "epoch": 0.12423780487804878, + "grad_norm": 0.10421420007241494, + "kl": 0.0545654296875, + "learning_rate": 2.9946345829878246e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 815 + }, + { + "completion_length": 2056.5000610351562, + "epoch": 0.12439024390243902, + "grad_norm": 0.14820076231106025, + "kl": 0.0601806640625, + "learning_rate": 2.9945669221337873e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 816 + }, + { + "completion_length": 816.3333435058594, + "epoch": 0.12454268292682927, + "grad_norm": 0.22402339700656915, + "kl": 0.081298828125, + "learning_rate": 2.99449883810241e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 817 + }, + { + "completion_length": 1066.5, + "epoch": 0.12469512195121951, + "grad_norm": 0.08115618952459773, + "kl": 0.0555419921875, + "learning_rate": 2.9944303309129717e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 818 + }, + { + "completion_length": 800.6666870117188, + "epoch": 0.12484756097560976, + "grad_norm": 0.19052245365623616, + "kl": 0.071044921875, + "learning_rate": 2.9943614005848684e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 819 + }, + { + "completion_length": 1231.0000610351562, + "epoch": 0.125, + "grad_norm": 0.7737794506005033, + "kl": 0.0640869140625, + "learning_rate": 2.994292047137618e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 820 + }, + { + "completion_length": 2128.1666870117188, + "epoch": 0.12515243902439024, + "grad_norm": 0.11694723401696305, + "kl": 0.066162109375, + "learning_rate": 2.9942222705908577e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 821 + }, + { + "completion_length": 507.5000305175781, + "epoch": 0.1253048780487805, + "grad_norm": 0.18737260321585378, + "kl": 0.0899658203125, + "learning_rate": 2.994152070964344e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 822 + }, + { + "completion_length": 1233.8333740234375, + "epoch": 0.12545731707317073, + "grad_norm": 0.12799523799649118, + "kl": 0.06884765625, + "learning_rate": 2.9940814482779533e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 823 + }, + { + "completion_length": 955.8333435058594, + "epoch": 0.12560975609756098, + "grad_norm": 0.103894541294793, + "kl": 0.051513671875, + "learning_rate": 2.994010402551682e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 824 + }, + { + "completion_length": 503.3333435058594, + "epoch": 0.12576219512195122, + "grad_norm": 0.21784065051944548, + "kl": 0.077392578125, + "learning_rate": 2.993938933805647e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 825 + }, + { + "completion_length": 791.1666870117188, + "epoch": 0.12591463414634146, + "grad_norm": 1.0086466915860037, + "kl": 0.0687255859375, + "learning_rate": 2.993867042060083e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 826 + }, + { + "completion_length": 591.6666717529297, + "epoch": 0.1260670731707317, + "grad_norm": 0.2074035920323988, + "kl": 0.072265625, + "learning_rate": 2.9937947273353463e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 827 + }, + { + "completion_length": 1156.3333740234375, + "epoch": 0.12621951219512195, + "grad_norm": 0.10074480471704983, + "kl": 0.047119140625, + "learning_rate": 2.993721989651913e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 828 + }, + { + "completion_length": 628.0000305175781, + "epoch": 0.1263719512195122, + "grad_norm": 0.20907003726370182, + "kl": 0.0621337890625, + "learning_rate": 2.993648829030377e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 829 + }, + { + "completion_length": 1010.1667175292969, + "epoch": 0.12652439024390244, + "grad_norm": 0.22805824938410413, + "kl": 0.06298828125, + "learning_rate": 2.9935752454914543e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 830 + }, + { + "completion_length": 1412.5000610351562, + "epoch": 0.12667682926829268, + "grad_norm": 0.10653092821279905, + "kl": 0.059326171875, + "learning_rate": 2.9935012390559797e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 831 + }, + { + "completion_length": 964.0000610351562, + "epoch": 0.12682926829268293, + "grad_norm": 0.14301825952252947, + "kl": 0.0567626953125, + "learning_rate": 2.9934268097449068e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 832 + }, + { + "completion_length": 936.5000305175781, + "epoch": 0.12698170731707317, + "grad_norm": 0.1155640310096733, + "kl": 0.065673828125, + "learning_rate": 2.9933519575793105e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 833 + }, + { + "completion_length": 897.3333435058594, + "epoch": 0.12713414634146342, + "grad_norm": 0.09308617237146122, + "kl": 0.0546875, + "learning_rate": 2.9932766825803845e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 834 + }, + { + "completion_length": 778.1666870117188, + "epoch": 0.12728658536585366, + "grad_norm": 0.16233268952544358, + "kl": 0.067626953125, + "learning_rate": 2.9932009847694424e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 835 + }, + { + "completion_length": 1094.6667175292969, + "epoch": 0.1274390243902439, + "grad_norm": 0.24568384837275506, + "kl": 0.0791015625, + "learning_rate": 2.9931248641679173e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 836 + }, + { + "completion_length": 933.8333435058594, + "epoch": 0.12759146341463415, + "grad_norm": 0.26701883672775045, + "kl": 0.06103515625, + "learning_rate": 2.993048320797363e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 837 + }, + { + "completion_length": 2496.0001220703125, + "epoch": 0.1277439024390244, + "grad_norm": 0.12723829430572814, + "kl": 0.054443359375, + "learning_rate": 2.9929713546794517e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 838 + }, + { + "completion_length": 1212.5000305175781, + "epoch": 0.12789634146341464, + "grad_norm": 0.08013117219462444, + "kl": 0.04736328125, + "learning_rate": 2.9928939658359764e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 839 + }, + { + "completion_length": 1656.5, + "epoch": 0.12804878048780488, + "grad_norm": 0.09471822528285401, + "kl": 0.0572509765625, + "learning_rate": 2.9928161542888487e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 840 + }, + { + "completion_length": 994.8333740234375, + "epoch": 0.12820121951219512, + "grad_norm": 0.14838610163578364, + "kl": 0.071044921875, + "learning_rate": 2.9927379200601005e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 841 + }, + { + "completion_length": 1323.8333740234375, + "epoch": 0.12835365853658537, + "grad_norm": 1.586636162549454, + "kl": 0.048828125, + "learning_rate": 2.9926592631718836e-06, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.1666666716337204, + "step": 842 + }, + { + "completion_length": 639.0000152587891, + "epoch": 0.1285060975609756, + "grad_norm": 0.1916767547342549, + "kl": 0.0736083984375, + "learning_rate": 2.9925801836464693e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 843 + }, + { + "completion_length": 536.1666870117188, + "epoch": 0.12865853658536586, + "grad_norm": 0.19654580060837895, + "kl": 0.0521240234375, + "learning_rate": 2.9925006815062483e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 844 + }, + { + "completion_length": 1598.5000915527344, + "epoch": 0.1288109756097561, + "grad_norm": 0.16854967672625157, + "kl": 0.0501708984375, + "learning_rate": 2.9924207567737306e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 845 + }, + { + "completion_length": 500.66668701171875, + "epoch": 0.12896341463414634, + "grad_norm": 0.14704134746632763, + "kl": 0.0859375, + "learning_rate": 2.9923404094715476e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 846 + }, + { + "completion_length": 1661.8333740234375, + "epoch": 0.1291158536585366, + "grad_norm": 0.11765130864274442, + "kl": 0.066162109375, + "learning_rate": 2.992259639622448e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 847 + }, + { + "completion_length": 1390.1666870117188, + "epoch": 0.12926829268292683, + "grad_norm": 0.21200380033914648, + "kl": 0.068603515625, + "learning_rate": 2.9921784472493023e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 848 + }, + { + "completion_length": 1117.166748046875, + "epoch": 0.12942073170731708, + "grad_norm": 11.148878362603032, + "kl": 0.151611328125, + "learning_rate": 2.9920968323750984e-06, + "loss": 0.006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 849 + }, + { + "completion_length": 949.1667175292969, + "epoch": 0.12957317073170732, + "grad_norm": 0.10049341080388018, + "kl": 0.048828125, + "learning_rate": 2.9920147950229464e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 850 + }, + { + "completion_length": 1031.8333740234375, + "epoch": 0.12972560975609757, + "grad_norm": 0.09422441693031854, + "kl": 0.047119140625, + "learning_rate": 2.9919323352160736e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 851 + }, + { + "completion_length": 1066.3333740234375, + "epoch": 0.1298780487804878, + "grad_norm": 1.1785613522635856, + "kl": 0.074951171875, + "learning_rate": 2.9918494529778285e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 852 + }, + { + "completion_length": 1461.6666870117188, + "epoch": 0.13003048780487805, + "grad_norm": 0.09919096105310062, + "kl": 0.0693359375, + "learning_rate": 2.991766148331678e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 853 + }, + { + "completion_length": 842.0, + "epoch": 0.1301829268292683, + "grad_norm": 0.16208109799729956, + "kl": 0.05517578125, + "learning_rate": 2.9916824213012106e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 854 + }, + { + "completion_length": 1005.1666870117188, + "epoch": 0.13033536585365854, + "grad_norm": 0.07344942539038915, + "kl": 0.0372314453125, + "learning_rate": 2.991598271910132e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 855 + }, + { + "completion_length": 1259.3333740234375, + "epoch": 0.13048780487804879, + "grad_norm": 0.17783937720573612, + "kl": 0.062744140625, + "learning_rate": 2.9915137001822686e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 856 + }, + { + "completion_length": 1338.3333740234375, + "epoch": 0.13064024390243903, + "grad_norm": 0.17656716234414488, + "kl": 0.071044921875, + "learning_rate": 2.991428706141567e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 857 + }, + { + "completion_length": 1037.6666870117188, + "epoch": 0.13079268292682927, + "grad_norm": 0.10653249380160239, + "kl": 0.051025390625, + "learning_rate": 2.9913432898120927e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 858 + }, + { + "completion_length": 1598.6666870117188, + "epoch": 0.13094512195121952, + "grad_norm": 2.0633216673308272, + "kl": 0.05712890625, + "learning_rate": 2.99125745121803e-06, + "loss": 0.0023, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 859 + }, + { + "completion_length": 1470.3333740234375, + "epoch": 0.13109756097560976, + "grad_norm": 0.09928356108228799, + "kl": 0.0443115234375, + "learning_rate": 2.9911711903836845e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 860 + }, + { + "completion_length": 898.3333435058594, + "epoch": 0.13125, + "grad_norm": 0.1787188963199907, + "kl": 0.073974609375, + "learning_rate": 2.9910845073334793e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 861 + }, + { + "completion_length": 2220.666748046875, + "epoch": 0.13140243902439025, + "grad_norm": 0.11690887348082389, + "kl": 0.05517578125, + "learning_rate": 2.9909974020919598e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 862 + }, + { + "completion_length": 1022.1667175292969, + "epoch": 0.1315548780487805, + "grad_norm": 0.15731817511621576, + "kl": 0.0579833984375, + "learning_rate": 2.9909098746837877e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 863 + }, + { + "completion_length": 1267.0000610351562, + "epoch": 0.13170731707317074, + "grad_norm": 0.11643898819630724, + "kl": 0.0506591796875, + "learning_rate": 2.9908219251337465e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 864 + }, + { + "completion_length": 884.3333435058594, + "epoch": 0.13185975609756098, + "grad_norm": 0.11230589998642622, + "kl": 0.0543212890625, + "learning_rate": 2.990733553466739e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 865 + }, + { + "completion_length": 1293.5, + "epoch": 0.13201219512195123, + "grad_norm": 0.08670451972316447, + "kl": 0.0516357421875, + "learning_rate": 2.9906447597077863e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 866 + }, + { + "completion_length": 1050.8333435058594, + "epoch": 0.13216463414634147, + "grad_norm": 0.1086545356370489, + "kl": 0.0389404296875, + "learning_rate": 2.9905555438820302e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 867 + }, + { + "completion_length": 717.3333740234375, + "epoch": 0.1323170731707317, + "grad_norm": 0.1323929788616386, + "kl": 0.0594482421875, + "learning_rate": 2.9904659060147314e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 868 + }, + { + "completion_length": 591.1666870117188, + "epoch": 0.13246951219512196, + "grad_norm": 0.1714141493572481, + "kl": 0.082275390625, + "learning_rate": 2.990375846131271e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 869 + }, + { + "completion_length": 855.0000305175781, + "epoch": 0.1326219512195122, + "grad_norm": 0.12162065179707132, + "kl": 0.058349609375, + "learning_rate": 2.990285364257148e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 870 + }, + { + "completion_length": 2103.8333740234375, + "epoch": 0.13277439024390245, + "grad_norm": 0.09400898384612696, + "kl": 0.0411376953125, + "learning_rate": 2.9901944604179824e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 871 + }, + { + "completion_length": 1158.3333740234375, + "epoch": 0.1329268292682927, + "grad_norm": 0.342120667582827, + "kl": 0.06884765625, + "learning_rate": 2.9901031346395125e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 872 + }, + { + "completion_length": 910.3333740234375, + "epoch": 0.13307926829268293, + "grad_norm": 0.15567112149732157, + "kl": 0.0616455078125, + "learning_rate": 2.9900113869475975e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 873 + }, + { + "completion_length": 1392.6666870117188, + "epoch": 0.13323170731707318, + "grad_norm": 0.4519917888344702, + "kl": 0.06396484375, + "learning_rate": 2.989919217368214e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 874 + }, + { + "completion_length": 2551.0001220703125, + "epoch": 0.13338414634146342, + "grad_norm": 0.07722984841142626, + "kl": 0.0479736328125, + "learning_rate": 2.98982662592746e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 875 + }, + { + "completion_length": 727.3333740234375, + "epoch": 0.13353658536585367, + "grad_norm": 0.09117820156457888, + "kl": 0.052001953125, + "learning_rate": 2.9897336126515525e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 876 + }, + { + "completion_length": 850.0, + "epoch": 0.1336890243902439, + "grad_norm": 2.2054913939648144, + "kl": 0.0458984375, + "learning_rate": 2.989640177566827e-06, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 877 + }, + { + "completion_length": 2059.8334350585938, + "epoch": 0.13384146341463415, + "grad_norm": 0.1143086317549927, + "kl": 0.05712890625, + "learning_rate": 2.9895463206997392e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 878 + }, + { + "completion_length": 1337.666748046875, + "epoch": 0.1339939024390244, + "grad_norm": 0.16503639886793336, + "kl": 0.069091796875, + "learning_rate": 2.989452042076864e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 879 + }, + { + "completion_length": 1943.666748046875, + "epoch": 0.13414634146341464, + "grad_norm": 0.08884891415801537, + "kl": 0.045654296875, + "learning_rate": 2.9893573417248957e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 880 + }, + { + "completion_length": 566.6666870117188, + "epoch": 0.1342987804878049, + "grad_norm": 0.14100861573042095, + "kl": 0.078369140625, + "learning_rate": 2.989262219670649e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 881 + }, + { + "completion_length": 801.3333435058594, + "epoch": 0.13445121951219513, + "grad_norm": 2.140478197840059, + "kl": 0.060791015625, + "learning_rate": 2.989166675941056e-06, + "loss": 0.0024, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 882 + }, + { + "completion_length": 726.3333435058594, + "epoch": 0.13460365853658537, + "grad_norm": 0.1450024524170428, + "kl": 0.06298828125, + "learning_rate": 2.98907071056317e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 883 + }, + { + "completion_length": 1884.3333740234375, + "epoch": 0.13475609756097562, + "grad_norm": 2.0813334176268965, + "kl": 0.0489501953125, + "learning_rate": 2.9889743235641627e-06, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 884 + }, + { + "completion_length": 886.8333435058594, + "epoch": 0.13490853658536586, + "grad_norm": 2.443445711100652, + "kl": 0.0753173828125, + "learning_rate": 2.9888775149713256e-06, + "loss": 0.003, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 885 + }, + { + "completion_length": 1281.6666870117188, + "epoch": 0.1350609756097561, + "grad_norm": 0.10835706140733026, + "kl": 0.0836181640625, + "learning_rate": 2.9887802848120697e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 886 + }, + { + "completion_length": 1711.3333740234375, + "epoch": 0.13521341463414635, + "grad_norm": 2.8371426264077355, + "kl": 0.064453125, + "learning_rate": 2.988682633113925e-06, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 887 + }, + { + "completion_length": 1179.5000610351562, + "epoch": 0.1353658536585366, + "grad_norm": 0.1676712391538588, + "kl": 0.05078125, + "learning_rate": 2.98858455990454e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 888 + }, + { + "completion_length": 1643.8334350585938, + "epoch": 0.13551829268292684, + "grad_norm": 0.3598570632386753, + "kl": 0.0654296875, + "learning_rate": 2.9884860652116846e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 889 + }, + { + "completion_length": 1070.0000610351562, + "epoch": 0.13567073170731708, + "grad_norm": 0.16602466240212513, + "kl": 0.075927734375, + "learning_rate": 2.9883871490632474e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 890 + }, + { + "completion_length": 1052.6666870117188, + "epoch": 0.13582317073170733, + "grad_norm": 0.1641432297688507, + "kl": 0.0926513671875, + "learning_rate": 2.9882878114872347e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 891 + }, + { + "completion_length": 2091.6666870117188, + "epoch": 0.13597560975609757, + "grad_norm": 0.1350649512899473, + "kl": 0.07177734375, + "learning_rate": 2.988188052511774e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 892 + }, + { + "completion_length": 2929.0, + "epoch": 0.13612804878048781, + "grad_norm": 0.3663360236701383, + "kl": 0.0692138671875, + "learning_rate": 2.988087872165111e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 893 + }, + { + "completion_length": 2849.3333740234375, + "epoch": 0.13628048780487806, + "grad_norm": 0.13938156774357527, + "kl": 0.0679931640625, + "learning_rate": 2.9879872704756113e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 894 + }, + { + "completion_length": 1371.0000305175781, + "epoch": 0.1364329268292683, + "grad_norm": 0.154629807356749, + "kl": 0.077392578125, + "learning_rate": 2.98788624747176e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 895 + }, + { + "completion_length": 1676.3333435058594, + "epoch": 0.13658536585365855, + "grad_norm": 1.21345051969234, + "kl": 0.0703125, + "learning_rate": 2.987784803182161e-06, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 896 + }, + { + "completion_length": 2844.1666870117188, + "epoch": 0.1367378048780488, + "grad_norm": 0.11845621016570652, + "kl": 0.052734375, + "learning_rate": 2.987682937635537e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 897 + }, + { + "completion_length": 1937.5000610351562, + "epoch": 0.13689024390243903, + "grad_norm": 0.15094046767551336, + "kl": 0.093505859375, + "learning_rate": 2.9875806508607315e-06, + "loss": 0.0037, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 898 + }, + { + "completion_length": 2897.83349609375, + "epoch": 0.13704268292682928, + "grad_norm": 0.1338140554326937, + "kl": 0.081298828125, + "learning_rate": 2.9874779428867054e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 899 + }, + { + "completion_length": 2279.166748046875, + "epoch": 0.13719512195121952, + "grad_norm": 0.28996734155305354, + "kl": 0.0697021484375, + "learning_rate": 2.9873748137425413e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 900 + }, + { + "completion_length": 1279.8333740234375, + "epoch": 0.13734756097560977, + "grad_norm": 0.3726552696364079, + "kl": 0.0947265625, + "learning_rate": 2.987271263457438e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 901 + }, + { + "completion_length": 1910.8334350585938, + "epoch": 0.1375, + "grad_norm": 1.7768532822562846, + "kl": 0.072021484375, + "learning_rate": 2.9871672920607156e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 902 + }, + { + "completion_length": 1651.3334350585938, + "epoch": 0.13765243902439026, + "grad_norm": 0.3066170157218148, + "kl": 0.0771484375, + "learning_rate": 2.9870628995818137e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 903 + }, + { + "completion_length": 2402.5000610351562, + "epoch": 0.1378048780487805, + "grad_norm": 0.09465120899669736, + "kl": 0.0548095703125, + "learning_rate": 2.9869580860502894e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 904 + }, + { + "completion_length": 1410.6666870117188, + "epoch": 0.13795731707317074, + "grad_norm": 0.12469297966493845, + "kl": 0.076416015625, + "learning_rate": 2.986852851495821e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 905 + }, + { + "completion_length": 1908.8333740234375, + "epoch": 0.138109756097561, + "grad_norm": 0.1324538311421892, + "kl": 0.093505859375, + "learning_rate": 2.9867471959482033e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 906 + }, + { + "completion_length": 2518.1666870117188, + "epoch": 0.13826219512195123, + "grad_norm": 0.08930873986657624, + "kl": 0.05419921875, + "learning_rate": 2.986641119437354e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 907 + }, + { + "completion_length": 1505.666748046875, + "epoch": 0.13841463414634148, + "grad_norm": 0.11696757011890133, + "kl": 0.070068359375, + "learning_rate": 2.986534621993307e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 908 + }, + { + "completion_length": 1029.5, + "epoch": 0.13856707317073172, + "grad_norm": 0.3392946975770214, + "kl": 0.12060546875, + "learning_rate": 2.9864277036462164e-06, + "loss": 0.0048, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 909 + }, + { + "completion_length": 1334.3333740234375, + "epoch": 0.13871951219512196, + "grad_norm": 1.4701439569226344, + "kl": 0.084228515625, + "learning_rate": 2.9863203644263556e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 910 + }, + { + "completion_length": 1898.5001220703125, + "epoch": 0.1388719512195122, + "grad_norm": 0.11210577843336642, + "kl": 0.06005859375, + "learning_rate": 2.986212604364117e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 911 + }, + { + "completion_length": 1399.8333740234375, + "epoch": 0.13902439024390245, + "grad_norm": 0.35723369767393115, + "kl": 0.094970703125, + "learning_rate": 2.9861044234900125e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 912 + }, + { + "completion_length": 861.6666870117188, + "epoch": 0.1391768292682927, + "grad_norm": 0.1965458953587927, + "kl": 0.10888671875, + "learning_rate": 2.985995821834672e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 913 + }, + { + "completion_length": 1275.5000305175781, + "epoch": 0.13932926829268294, + "grad_norm": 0.18885714271012988, + "kl": 0.067626953125, + "learning_rate": 2.9858867994288466e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 914 + }, + { + "completion_length": 2202.83349609375, + "epoch": 0.13948170731707318, + "grad_norm": 0.12726576184399327, + "kl": 0.0673828125, + "learning_rate": 2.9857773563034045e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 915 + }, + { + "completion_length": 2187.6666870117188, + "epoch": 0.13963414634146343, + "grad_norm": 0.19183357911467286, + "kl": 0.0887451171875, + "learning_rate": 2.9856674924893338e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 916 + }, + { + "completion_length": 990.5, + "epoch": 0.13978658536585367, + "grad_norm": 0.2247232793211306, + "kl": 0.092041015625, + "learning_rate": 2.985557208017742e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 917 + }, + { + "completion_length": 1032.5, + "epoch": 0.13993902439024392, + "grad_norm": 0.10333151920248819, + "kl": 0.0550537109375, + "learning_rate": 2.985446502919855e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 918 + }, + { + "completion_length": 1842.5000610351562, + "epoch": 0.14009146341463416, + "grad_norm": 0.25247768276207166, + "kl": 0.07373046875, + "learning_rate": 2.9853353772270196e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 919 + }, + { + "completion_length": 897.8333435058594, + "epoch": 0.1402439024390244, + "grad_norm": 0.1537360621286506, + "kl": 0.06640625, + "learning_rate": 2.985223830970699e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 920 + }, + { + "completion_length": 1133.8333740234375, + "epoch": 0.14039634146341465, + "grad_norm": 0.17439000434206703, + "kl": 0.079833984375, + "learning_rate": 2.985111864182477e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 921 + }, + { + "completion_length": 1216.3333740234375, + "epoch": 0.1405487804878049, + "grad_norm": 0.14114670579069297, + "kl": 0.0628662109375, + "learning_rate": 2.984999476894057e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 922 + }, + { + "completion_length": 1165.8333740234375, + "epoch": 0.1407012195121951, + "grad_norm": 0.22741573393076064, + "kl": 0.10546875, + "learning_rate": 2.9848866691372602e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 923 + }, + { + "completion_length": 992.6666870117188, + "epoch": 0.14085365853658535, + "grad_norm": 0.19164494447360717, + "kl": 0.085693359375, + "learning_rate": 2.984773440944027e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 924 + }, + { + "completion_length": 1471.3333740234375, + "epoch": 0.1410060975609756, + "grad_norm": 0.17859618266638816, + "kl": 0.0928955078125, + "learning_rate": 2.9846597923464185e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 925 + }, + { + "completion_length": 1584.5000610351562, + "epoch": 0.14115853658536584, + "grad_norm": 0.10462406350151876, + "kl": 0.06103515625, + "learning_rate": 2.984545723376613e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 926 + }, + { + "completion_length": 745.3333740234375, + "epoch": 0.14131097560975608, + "grad_norm": 2.287832611232294, + "kl": 0.085693359375, + "learning_rate": 2.9844312340669073e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 927 + }, + { + "completion_length": 882.5, + "epoch": 0.14146341463414633, + "grad_norm": 0.18224820776954057, + "kl": 0.09423828125, + "learning_rate": 2.98431632444972e-06, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 928 + }, + { + "completion_length": 708.0000305175781, + "epoch": 0.14161585365853657, + "grad_norm": 2.041700528510938, + "kl": 0.08544921875, + "learning_rate": 2.9842009945575867e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 929 + }, + { + "completion_length": 1287.5000305175781, + "epoch": 0.14176829268292682, + "grad_norm": 0.13635240067016144, + "kl": 0.0751953125, + "learning_rate": 2.9840852444231613e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 930 + }, + { + "completion_length": 1009.3333740234375, + "epoch": 0.14192073170731706, + "grad_norm": 0.14217417096826035, + "kl": 0.11865234375, + "learning_rate": 2.983969074079219e-06, + "loss": 0.0047, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 931 + }, + { + "completion_length": 2443.0, + "epoch": 0.1420731707317073, + "grad_norm": 0.1798931595474517, + "kl": 0.0673828125, + "learning_rate": 2.983852483558652e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 932 + }, + { + "completion_length": 1140.1667175292969, + "epoch": 0.14222560975609755, + "grad_norm": 0.25146556621745697, + "kl": 0.0924072265625, + "learning_rate": 2.983735472894473e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 933 + }, + { + "completion_length": 1803.5, + "epoch": 0.1423780487804878, + "grad_norm": 0.28134252878307103, + "kl": 0.075439453125, + "learning_rate": 2.9836180421198112e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 934 + }, + { + "completion_length": 1190.1666870117188, + "epoch": 0.14253048780487804, + "grad_norm": 0.315740899958754, + "kl": 0.109375, + "learning_rate": 2.983500191267918e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 935 + }, + { + "completion_length": 906.5, + "epoch": 0.14268292682926828, + "grad_norm": 0.12961024238596539, + "kl": 0.0888671875, + "learning_rate": 2.9833819203721614e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 936 + }, + { + "completion_length": 1172.6667175292969, + "epoch": 0.14283536585365852, + "grad_norm": 0.30722439590056083, + "kl": 0.093017578125, + "learning_rate": 2.9832632294660293e-06, + "loss": 0.0037, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 937 + }, + { + "completion_length": 1076.3333740234375, + "epoch": 0.14298780487804877, + "grad_norm": 0.19029377792931815, + "kl": 0.080322265625, + "learning_rate": 2.983144118583128e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 938 + }, + { + "completion_length": 685.6666870117188, + "epoch": 0.143140243902439, + "grad_norm": 0.13975405883507971, + "kl": 0.0689697265625, + "learning_rate": 2.9830245877571835e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 939 + }, + { + "completion_length": 1220.6666870117188, + "epoch": 0.14329268292682926, + "grad_norm": 0.15295402794801966, + "kl": 0.08544921875, + "learning_rate": 2.98290463702204e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 940 + }, + { + "completion_length": 840.0, + "epoch": 0.1434451219512195, + "grad_norm": 0.4142125841905517, + "kl": 0.090087890625, + "learning_rate": 2.9827842664116596e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 941 + }, + { + "completion_length": 890.0, + "epoch": 0.14359756097560974, + "grad_norm": 0.9938786119904239, + "kl": 0.13037109375, + "learning_rate": 2.9826634759601267e-06, + "loss": 0.0052, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 942 + }, + { + "completion_length": 2209.1666870117188, + "epoch": 0.14375, + "grad_norm": 1.143476237481132, + "kl": 0.05078125, + "learning_rate": 2.982542265701641e-06, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 943 + }, + { + "completion_length": 568.5000152587891, + "epoch": 0.14390243902439023, + "grad_norm": 0.16679524488057623, + "kl": 0.097412109375, + "learning_rate": 2.982420635670523e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 944 + }, + { + "completion_length": 873.5000305175781, + "epoch": 0.14405487804878048, + "grad_norm": 0.14823696356766017, + "kl": 0.0888671875, + "learning_rate": 2.9822985859012105e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 945 + }, + { + "completion_length": 2815.0001220703125, + "epoch": 0.14420731707317072, + "grad_norm": 0.10300958227156132, + "kl": 0.0733642578125, + "learning_rate": 2.982176116428262e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 946 + }, + { + "completion_length": 1499.8333740234375, + "epoch": 0.14435975609756097, + "grad_norm": 0.17350883935569808, + "kl": 0.0771484375, + "learning_rate": 2.9820532272863544e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 947 + }, + { + "completion_length": 835.6666870117188, + "epoch": 0.1445121951219512, + "grad_norm": 0.14667480479476808, + "kl": 0.0732421875, + "learning_rate": 2.9819299185102824e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 948 + }, + { + "completion_length": 2120.6666870117188, + "epoch": 0.14466463414634145, + "grad_norm": 0.10720887433999687, + "kl": 0.077880859375, + "learning_rate": 2.9818061901349597e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 949 + }, + { + "completion_length": 801.1666870117188, + "epoch": 0.1448170731707317, + "grad_norm": 0.1362235178876044, + "kl": 0.121337890625, + "learning_rate": 2.9816820421954194e-06, + "loss": 0.0048, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 950 + }, + { + "completion_length": 1111.1667175292969, + "epoch": 0.14496951219512194, + "grad_norm": 0.1646369754202521, + "kl": 0.0908203125, + "learning_rate": 2.981557474726814e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 951 + }, + { + "completion_length": 1185.666748046875, + "epoch": 0.14512195121951219, + "grad_norm": 0.14892241150224184, + "kl": 0.092041015625, + "learning_rate": 2.981432487764413e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 952 + }, + { + "completion_length": 897.1667175292969, + "epoch": 0.14527439024390243, + "grad_norm": 0.26786371748957366, + "kl": 0.0693359375, + "learning_rate": 2.9813070813436064e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 953 + }, + { + "completion_length": 620.8333435058594, + "epoch": 0.14542682926829267, + "grad_norm": 0.23320216748682943, + "kl": 0.08642578125, + "learning_rate": 2.981181255499902e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 954 + }, + { + "completion_length": 1397.5, + "epoch": 0.14557926829268292, + "grad_norm": 0.49080450829232963, + "kl": 0.094482421875, + "learning_rate": 2.9810550102689264e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 955 + }, + { + "completion_length": 647.8333435058594, + "epoch": 0.14573170731707316, + "grad_norm": 0.17265523948210718, + "kl": 0.094970703125, + "learning_rate": 2.9809283456864257e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 956 + }, + { + "completion_length": 1911.8333740234375, + "epoch": 0.1458841463414634, + "grad_norm": 2.089020404534126, + "kl": 0.0615234375, + "learning_rate": 2.9808012617882634e-06, + "loss": 0.0025, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 957 + }, + { + "completion_length": 745.0, + "epoch": 0.14603658536585365, + "grad_norm": 0.2598153099752304, + "kl": 0.085205078125, + "learning_rate": 2.980673758610423e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 958 + }, + { + "completion_length": 1596.5000610351562, + "epoch": 0.1461890243902439, + "grad_norm": 0.11052101365309595, + "kl": 0.07275390625, + "learning_rate": 2.9805458361890064e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 959 + }, + { + "completion_length": 1310.8333740234375, + "epoch": 0.14634146341463414, + "grad_norm": 0.14353289501669997, + "kl": 0.065185546875, + "learning_rate": 2.980417494560234e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 960 + }, + { + "completion_length": 1915.1667175292969, + "epoch": 0.14649390243902438, + "grad_norm": 0.13452229782888317, + "kl": 0.08349609375, + "learning_rate": 2.9802887337604443e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 961 + }, + { + "completion_length": 3087.3333740234375, + "epoch": 0.14664634146341463, + "grad_norm": 0.06869727353755008, + "kl": 0.0606689453125, + "learning_rate": 2.980159553826096e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 962 + }, + { + "completion_length": 824.3333435058594, + "epoch": 0.14679878048780487, + "grad_norm": 1.167572974301665, + "kl": 0.10205078125, + "learning_rate": 2.980029954793765e-06, + "loss": 0.0041, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 963 + }, + { + "completion_length": 2711.83349609375, + "epoch": 0.1469512195121951, + "grad_norm": 0.06514188719497706, + "kl": 0.05810546875, + "learning_rate": 2.9798999367001467e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 964 + }, + { + "completion_length": 1956.8334350585938, + "epoch": 0.14710365853658536, + "grad_norm": 0.160859850096583, + "kl": 0.06494140625, + "learning_rate": 2.979769499582054e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 965 + }, + { + "completion_length": 3814.666748046875, + "epoch": 0.1472560975609756, + "grad_norm": 0.051528998416075576, + "kl": 0.0443115234375, + "learning_rate": 2.9796386434764216e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 966 + }, + { + "completion_length": 2882.0, + "epoch": 0.14740853658536585, + "grad_norm": 0.10280099081214292, + "kl": 0.0712890625, + "learning_rate": 2.9795073684202987e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 967 + }, + { + "completion_length": 1780.8333740234375, + "epoch": 0.1475609756097561, + "grad_norm": 0.1763476872852246, + "kl": 0.0633544921875, + "learning_rate": 2.979375674450855e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 968 + }, + { + "completion_length": 3441.3333740234375, + "epoch": 0.14771341463414633, + "grad_norm": 0.07969291697621575, + "kl": 0.0596923828125, + "learning_rate": 2.97924356160538e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 969 + }, + { + "completion_length": 3109.166748046875, + "epoch": 0.14786585365853658, + "grad_norm": 0.12709483693797757, + "kl": 0.058349609375, + "learning_rate": 2.9791110299212797e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 970 + }, + { + "completion_length": 2092.0001220703125, + "epoch": 0.14801829268292682, + "grad_norm": 0.09639043742348041, + "kl": 0.074951171875, + "learning_rate": 2.9789780794360796e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 971 + }, + { + "completion_length": 3462.666748046875, + "epoch": 0.14817073170731707, + "grad_norm": 0.058174620313857124, + "kl": 0.057861328125, + "learning_rate": 2.9788447101874246e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 972 + }, + { + "completion_length": 2939.666748046875, + "epoch": 0.1483231707317073, + "grad_norm": 0.12369885853075817, + "kl": 0.05615234375, + "learning_rate": 2.978710922213077e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 973 + }, + { + "completion_length": 3030.666748046875, + "epoch": 0.14847560975609755, + "grad_norm": 0.07855111781897468, + "kl": 0.060546875, + "learning_rate": 2.9785767155509175e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 974 + }, + { + "completion_length": 2165.8333740234375, + "epoch": 0.1486280487804878, + "grad_norm": 0.10388689473971864, + "kl": 0.072021484375, + "learning_rate": 2.9784420902389465e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 975 + }, + { + "completion_length": 1295.1666870117188, + "epoch": 0.14878048780487804, + "grad_norm": 0.13304799238202122, + "kl": 0.074951171875, + "learning_rate": 2.9783070463152816e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 976 + }, + { + "completion_length": 3164.8333740234375, + "epoch": 0.1489329268292683, + "grad_norm": 0.06269767535896315, + "kl": 0.05224609375, + "learning_rate": 2.978171583818161e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 977 + }, + { + "completion_length": 2090.5, + "epoch": 0.14908536585365853, + "grad_norm": 1.8657385311128156, + "kl": 0.0791015625, + "learning_rate": 2.978035702785939e-06, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 978 + }, + { + "completion_length": 2785.83349609375, + "epoch": 0.14923780487804877, + "grad_norm": 0.06926632387618296, + "kl": 0.065673828125, + "learning_rate": 2.9778994032570893e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 979 + }, + { + "completion_length": 1892.6666870117188, + "epoch": 0.14939024390243902, + "grad_norm": 0.10359623988360156, + "kl": 0.080810546875, + "learning_rate": 2.977762685270205e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 980 + }, + { + "completion_length": 3541.3333740234375, + "epoch": 0.14954268292682926, + "grad_norm": 0.06313110414328532, + "kl": 0.072998046875, + "learning_rate": 2.9776255488639965e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 981 + }, + { + "completion_length": 2402.0000610351562, + "epoch": 0.1496951219512195, + "grad_norm": 0.21855410179774748, + "kl": 0.072265625, + "learning_rate": 2.977487994077293e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 982 + }, + { + "completion_length": 1311.3333740234375, + "epoch": 0.14984756097560975, + "grad_norm": 0.154617104691706, + "kl": 0.079345703125, + "learning_rate": 2.9773500209490433e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 983 + }, + { + "completion_length": 1609.3333740234375, + "epoch": 0.15, + "grad_norm": 0.07568790151120655, + "kl": 0.074951171875, + "learning_rate": 2.9772116295183124e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 984 + }, + { + "completion_length": 819.5, + "epoch": 0.15015243902439024, + "grad_norm": 0.196720686603035, + "kl": 0.119384765625, + "learning_rate": 2.977072819824285e-06, + "loss": 0.0048, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 985 + }, + { + "completion_length": 2507.0, + "epoch": 0.15030487804878048, + "grad_norm": 0.09075318636500998, + "kl": 0.07421875, + "learning_rate": 2.9769335919062653e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 986 + }, + { + "completion_length": 1411.8333740234375, + "epoch": 0.15045731707317073, + "grad_norm": 0.09485957179686975, + "kl": 0.0537109375, + "learning_rate": 2.9767939458036737e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 987 + }, + { + "completion_length": 890.3333740234375, + "epoch": 0.15060975609756097, + "grad_norm": 0.1319637485163045, + "kl": 0.078857421875, + "learning_rate": 2.976653881556051e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 988 + }, + { + "completion_length": 2500.5, + "epoch": 0.15076219512195121, + "grad_norm": 0.09967308656050486, + "kl": 0.064697265625, + "learning_rate": 2.9765133992030543e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 989 + }, + { + "completion_length": 1074.0, + "epoch": 0.15091463414634146, + "grad_norm": 0.08387467194348726, + "kl": 0.06982421875, + "learning_rate": 2.9763724987844617e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 990 + }, + { + "completion_length": 1815.3333740234375, + "epoch": 0.1510670731707317, + "grad_norm": 0.07446190738955732, + "kl": 0.072998046875, + "learning_rate": 2.976231180340168e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 991 + }, + { + "completion_length": 1917.6666717529297, + "epoch": 0.15121951219512195, + "grad_norm": 0.20081305255055543, + "kl": 0.0811767578125, + "learning_rate": 2.9760894439101857e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 992 + }, + { + "completion_length": 1530.5, + "epoch": 0.1513719512195122, + "grad_norm": 0.08149072719695079, + "kl": 0.074951171875, + "learning_rate": 2.9759472895346477e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 993 + }, + { + "completion_length": 1734.3333740234375, + "epoch": 0.15152439024390243, + "grad_norm": 0.1445202317495466, + "kl": 0.069091796875, + "learning_rate": 2.9758047172538033e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 994 + }, + { + "completion_length": 1595.1666870117188, + "epoch": 0.15167682926829268, + "grad_norm": 1.6690827635002332, + "kl": 0.11279296875, + "learning_rate": 2.9756617271080216e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 995 + }, + { + "completion_length": 1698.8333435058594, + "epoch": 0.15182926829268292, + "grad_norm": 0.11057750974472705, + "kl": 0.070068359375, + "learning_rate": 2.9755183191377888e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 996 + }, + { + "completion_length": 1437.0000610351562, + "epoch": 0.15198170731707317, + "grad_norm": 0.11954266706584818, + "kl": 0.07470703125, + "learning_rate": 2.97537449338371e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 997 + }, + { + "completion_length": 1721.5, + "epoch": 0.1521341463414634, + "grad_norm": 0.1863044246598857, + "kl": 0.072509765625, + "learning_rate": 2.975230249886509e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 998 + }, + { + "completion_length": 2633.3334350585938, + "epoch": 0.15228658536585366, + "grad_norm": 0.07296050206431708, + "kl": 0.076416015625, + "learning_rate": 2.975085588687028e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 999 + }, + { + "completion_length": 2023.5001220703125, + "epoch": 0.1524390243902439, + "grad_norm": 0.09826530573400344, + "kl": 0.0667724609375, + "learning_rate": 2.974940509826225e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1000 + }, + { + "completion_length": 2199.5000610351562, + "epoch": 0.15259146341463414, + "grad_norm": 0.11570500021478004, + "kl": 0.070068359375, + "learning_rate": 2.9747950133451802e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1001 + }, + { + "completion_length": 2483.666748046875, + "epoch": 0.1527439024390244, + "grad_norm": 0.07297126869953213, + "kl": 0.057861328125, + "learning_rate": 2.974649099285089e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1002 + }, + { + "completion_length": 1558.166748046875, + "epoch": 0.15289634146341463, + "grad_norm": 0.09660761209644958, + "kl": 0.0614013671875, + "learning_rate": 2.974502767687266e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1003 + }, + { + "completion_length": 2511.8333740234375, + "epoch": 0.15304878048780488, + "grad_norm": 0.06263370549137114, + "kl": 0.072998046875, + "learning_rate": 2.9743560185931443e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1004 + }, + { + "completion_length": 1759.3333740234375, + "epoch": 0.15320121951219512, + "grad_norm": 0.08556768696581724, + "kl": 0.0504150390625, + "learning_rate": 2.974208852044275e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1005 + }, + { + "completion_length": 1331.6667175292969, + "epoch": 0.15335365853658536, + "grad_norm": 0.09282773307431855, + "kl": 0.0673828125, + "learning_rate": 2.974061268082328e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1006 + }, + { + "completion_length": 1818.3334350585938, + "epoch": 0.1535060975609756, + "grad_norm": 0.07710304256913801, + "kl": 0.052490234375, + "learning_rate": 2.9739132667490898e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1007 + }, + { + "completion_length": 1197.5000610351562, + "epoch": 0.15365853658536585, + "grad_norm": 0.17074418742739444, + "kl": 0.097412109375, + "learning_rate": 2.973764848086466e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1008 + }, + { + "completion_length": 1819.0000610351562, + "epoch": 0.1538109756097561, + "grad_norm": 0.11306816837769314, + "kl": 0.0689697265625, + "learning_rate": 2.973616012136482e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1009 + }, + { + "completion_length": 2251.3333740234375, + "epoch": 0.15396341463414634, + "grad_norm": 0.10446151305410824, + "kl": 0.06689453125, + "learning_rate": 2.973466758941278e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1010 + }, + { + "completion_length": 1311.6666870117188, + "epoch": 0.15411585365853658, + "grad_norm": 0.1268250351176635, + "kl": 0.090576171875, + "learning_rate": 2.973317088543115e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1011 + }, + { + "completion_length": 1554.3334350585938, + "epoch": 0.15426829268292683, + "grad_norm": 0.14139533655178732, + "kl": 0.077880859375, + "learning_rate": 2.9731670009843704e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1012 + }, + { + "completion_length": 1290.0, + "epoch": 0.15442073170731707, + "grad_norm": 0.22302143210851125, + "kl": 0.0673828125, + "learning_rate": 2.973016496307542e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1013 + }, + { + "completion_length": 1747.0000610351562, + "epoch": 0.15457317073170732, + "grad_norm": 0.15022443937677754, + "kl": 0.06982421875, + "learning_rate": 2.972865574555243e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1014 + }, + { + "completion_length": 2329.0, + "epoch": 0.15472560975609756, + "grad_norm": 0.06588897533985752, + "kl": 0.0595703125, + "learning_rate": 2.9727142357702062e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1015 + }, + { + "completion_length": 1544.3333740234375, + "epoch": 0.1548780487804878, + "grad_norm": 0.12122259689461896, + "kl": 0.07080078125, + "learning_rate": 2.9725624799952824e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1016 + }, + { + "completion_length": 1139.5000610351562, + "epoch": 0.15503048780487805, + "grad_norm": 0.10305438114424685, + "kl": 0.06689453125, + "learning_rate": 2.9724103072734404e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1017 + }, + { + "completion_length": 1470.5000610351562, + "epoch": 0.1551829268292683, + "grad_norm": 0.14796605158722495, + "kl": 0.057861328125, + "learning_rate": 2.9722577176477673e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1018 + }, + { + "completion_length": 2327.3333740234375, + "epoch": 0.15533536585365854, + "grad_norm": 0.06286233704539616, + "kl": 0.052490234375, + "learning_rate": 2.9721047111614675e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1019 + }, + { + "completion_length": 1152.3333435058594, + "epoch": 0.15548780487804878, + "grad_norm": 2.2290672728164904, + "kl": 0.08544921875, + "learning_rate": 2.971951287857863e-06, + "loss": 0.0034, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1020 + }, + { + "completion_length": 1086.8333435058594, + "epoch": 0.15564024390243902, + "grad_norm": 0.16044299600337134, + "kl": 0.0655517578125, + "learning_rate": 2.9717974477803963e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1021 + }, + { + "completion_length": 1476.666748046875, + "epoch": 0.15579268292682927, + "grad_norm": 0.15906227197012998, + "kl": 0.0606689453125, + "learning_rate": 2.9716431909726257e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1022 + }, + { + "completion_length": 2381.83349609375, + "epoch": 0.1559451219512195, + "grad_norm": 0.07937931794682637, + "kl": 0.05419921875, + "learning_rate": 2.971488517478227e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1023 + }, + { + "completion_length": 2028.666748046875, + "epoch": 0.15609756097560976, + "grad_norm": 0.06836163125223726, + "kl": 0.059814453125, + "learning_rate": 2.9713334273409965e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1024 + }, + { + "completion_length": 2871.3333740234375, + "epoch": 0.15625, + "grad_norm": 0.07330625207543286, + "kl": 0.056396484375, + "learning_rate": 2.971177920604846e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1025 + }, + { + "completion_length": 2523.666748046875, + "epoch": 0.15640243902439024, + "grad_norm": 0.107579577051837, + "kl": 0.059326171875, + "learning_rate": 2.9710219973138063e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1026 + }, + { + "completion_length": 2599.666748046875, + "epoch": 0.1565548780487805, + "grad_norm": 0.0671054569781134, + "kl": 0.051025390625, + "learning_rate": 2.970865657512027e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1027 + }, + { + "completion_length": 2592.3333740234375, + "epoch": 0.15670731707317073, + "grad_norm": 0.07526508450861857, + "kl": 0.0643310546875, + "learning_rate": 2.970708901243774e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1028 + }, + { + "completion_length": 1908.8333740234375, + "epoch": 0.15685975609756098, + "grad_norm": 0.20419999611657574, + "kl": 0.075927734375, + "learning_rate": 2.970551728553432e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1029 + }, + { + "completion_length": 1866.3334350585938, + "epoch": 0.15701219512195122, + "grad_norm": 0.1058061283736273, + "kl": 0.0513916015625, + "learning_rate": 2.9703941394855036e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1030 + }, + { + "completion_length": 1717.166748046875, + "epoch": 0.15716463414634146, + "grad_norm": 0.0710703404405416, + "kl": 0.0648193359375, + "learning_rate": 2.9702361340846092e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1031 + }, + { + "completion_length": 1020.5000305175781, + "epoch": 0.1573170731707317, + "grad_norm": 0.10925959886705108, + "kl": 0.0751953125, + "learning_rate": 2.9700777123954867e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1032 + }, + { + "completion_length": 1391.5, + "epoch": 0.15746951219512195, + "grad_norm": 0.11075307052434424, + "kl": 0.065185546875, + "learning_rate": 2.9699188744629922e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1033 + }, + { + "completion_length": 1251.3333435058594, + "epoch": 0.1576219512195122, + "grad_norm": 0.10335246826105339, + "kl": 0.0665283203125, + "learning_rate": 2.9697596203321004e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1034 + }, + { + "completion_length": 1034.5000610351562, + "epoch": 0.15777439024390244, + "grad_norm": 0.15851901440224378, + "kl": 0.08935546875, + "learning_rate": 2.969599950047902e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1035 + }, + { + "completion_length": 1901.5000610351562, + "epoch": 0.15792682926829268, + "grad_norm": 0.09164162814028888, + "kl": 0.08544921875, + "learning_rate": 2.969439863655608e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1036 + }, + { + "completion_length": 1936.6666870117188, + "epoch": 0.15807926829268293, + "grad_norm": 0.09862863970472353, + "kl": 0.078125, + "learning_rate": 2.969279361200545e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1037 + }, + { + "completion_length": 1537.6666870117188, + "epoch": 0.15823170731707317, + "grad_norm": 0.07951591745195265, + "kl": 0.064453125, + "learning_rate": 2.969118442728158e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1038 + }, + { + "completion_length": 2364.0000610351562, + "epoch": 0.15838414634146342, + "grad_norm": 0.07510280156892132, + "kl": 0.0517578125, + "learning_rate": 2.968957108284011e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1039 + }, + { + "completion_length": 1227.0000305175781, + "epoch": 0.15853658536585366, + "grad_norm": 0.36609572592407635, + "kl": 0.08642578125, + "learning_rate": 2.968795357913784e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1040 + }, + { + "completion_length": 2834.166748046875, + "epoch": 0.1586890243902439, + "grad_norm": 0.08770101046078066, + "kl": 0.0489501953125, + "learning_rate": 2.9686331916632764e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1041 + }, + { + "completion_length": 1197.1666870117188, + "epoch": 0.15884146341463415, + "grad_norm": 0.13735877904944624, + "kl": 0.07470703125, + "learning_rate": 2.9684706095784037e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1042 + }, + { + "completion_length": 1754.0000610351562, + "epoch": 0.1589939024390244, + "grad_norm": 0.09263567835610703, + "kl": 0.0521240234375, + "learning_rate": 2.968307611705201e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1043 + }, + { + "completion_length": 1207.1666717529297, + "epoch": 0.15914634146341464, + "grad_norm": 0.30946123563663036, + "kl": 0.106201171875, + "learning_rate": 2.968144198089819e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1044 + }, + { + "completion_length": 2047.8333740234375, + "epoch": 0.15929878048780488, + "grad_norm": 0.08467065070603982, + "kl": 0.059814453125, + "learning_rate": 2.967980368778528e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1045 + }, + { + "completion_length": 1006.1666870117188, + "epoch": 0.15945121951219512, + "grad_norm": 0.14378623125860196, + "kl": 0.06591796875, + "learning_rate": 2.9678161238177145e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1046 + }, + { + "completion_length": 1030.1667175292969, + "epoch": 0.15960365853658537, + "grad_norm": 0.1295714479234479, + "kl": 0.0740966796875, + "learning_rate": 2.9676514632538848e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1047 + }, + { + "completion_length": 1609.1666717529297, + "epoch": 0.1597560975609756, + "grad_norm": 0.10280812047845218, + "kl": 0.0506591796875, + "learning_rate": 2.9674863871336603e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1048 + }, + { + "completion_length": 2152.5, + "epoch": 0.15990853658536586, + "grad_norm": 0.1386754615794903, + "kl": 0.057373046875, + "learning_rate": 2.9673208955037818e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1049 + }, + { + "completion_length": 991.0, + "epoch": 0.1600609756097561, + "grad_norm": 0.13662131215635473, + "kl": 0.07763671875, + "learning_rate": 2.967154988411107e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1050 + }, + { + "completion_length": 2590.166748046875, + "epoch": 0.16021341463414634, + "grad_norm": 0.08349908885882013, + "kl": 0.0657958984375, + "learning_rate": 2.966988665902612e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1051 + }, + { + "completion_length": 1162.5000305175781, + "epoch": 0.1603658536585366, + "grad_norm": 0.12419639829230607, + "kl": 0.06103515625, + "learning_rate": 2.966821928025389e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1052 + }, + { + "completion_length": 1187.5, + "epoch": 0.16051829268292683, + "grad_norm": 0.11669969398930297, + "kl": 0.071533203125, + "learning_rate": 2.9666547748266495e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1053 + }, + { + "completion_length": 1880.0000610351562, + "epoch": 0.16067073170731708, + "grad_norm": 0.2875205053384481, + "kl": 0.068603515625, + "learning_rate": 2.9664872063537217e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1054 + }, + { + "completion_length": 1694.666748046875, + "epoch": 0.16082317073170732, + "grad_norm": 0.512478486900961, + "kl": 0.11376953125, + "learning_rate": 2.966319222654052e-06, + "loss": 0.0046, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1055 + }, + { + "completion_length": 1170.6666870117188, + "epoch": 0.16097560975609757, + "grad_norm": 0.10190806018806456, + "kl": 0.07958984375, + "learning_rate": 2.9661508237752034e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1056 + }, + { + "completion_length": 1354.3334350585938, + "epoch": 0.1611280487804878, + "grad_norm": 0.11521260885667245, + "kl": 0.083740234375, + "learning_rate": 2.9659820097648567e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1057 + }, + { + "completion_length": 1488.5, + "epoch": 0.16128048780487805, + "grad_norm": 0.09408845677374726, + "kl": 0.0574951171875, + "learning_rate": 2.9658127806708114e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1058 + }, + { + "completion_length": 1577.166748046875, + "epoch": 0.1614329268292683, + "grad_norm": 0.49963575927929127, + "kl": 0.10107421875, + "learning_rate": 2.9656431365409837e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1059 + }, + { + "completion_length": 1289.1667175292969, + "epoch": 0.16158536585365854, + "grad_norm": 0.08602820572671178, + "kl": 0.08056640625, + "learning_rate": 2.9654730774234067e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1060 + }, + { + "completion_length": 1382.3333740234375, + "epoch": 0.16173780487804879, + "grad_norm": 0.15919577926762302, + "kl": 0.097412109375, + "learning_rate": 2.9653026033662312e-06, + "loss": 0.0039, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1061 + }, + { + "completion_length": 1348.5, + "epoch": 0.16189024390243903, + "grad_norm": 0.10211633203852008, + "kl": 0.0667724609375, + "learning_rate": 2.9651317144177264e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1062 + }, + { + "completion_length": 1547.3334350585938, + "epoch": 0.16204268292682927, + "grad_norm": 0.09838993847121717, + "kl": 0.0645751953125, + "learning_rate": 2.9649604106262785e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1063 + }, + { + "completion_length": 2555.0001220703125, + "epoch": 0.16219512195121952, + "grad_norm": 0.08125608456290742, + "kl": 0.0653076171875, + "learning_rate": 2.9647886920403916e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1064 + }, + { + "completion_length": 1341.1666870117188, + "epoch": 0.16234756097560976, + "grad_norm": 0.06937020223525167, + "kl": 0.06298828125, + "learning_rate": 2.964616558708686e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1065 + }, + { + "completion_length": 1838.0, + "epoch": 0.1625, + "grad_norm": 0.1035760365473097, + "kl": 0.080810546875, + "learning_rate": 2.9644440106799e-06, + "loss": 0.0032, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1066 + }, + { + "completion_length": 3276.8333740234375, + "epoch": 0.16265243902439025, + "grad_norm": 0.305052783961104, + "kl": 0.054443359375, + "learning_rate": 2.9642710480028902e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1067 + }, + { + "completion_length": 2274.5000610351562, + "epoch": 0.1628048780487805, + "grad_norm": 0.07418223677054597, + "kl": 0.0643310546875, + "learning_rate": 2.9640976707266297e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1068 + }, + { + "completion_length": 2700.666748046875, + "epoch": 0.16295731707317074, + "grad_norm": 0.09312512759380724, + "kl": 0.0560302734375, + "learning_rate": 2.963923878900209e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1069 + }, + { + "completion_length": 1436.0, + "epoch": 0.16310975609756098, + "grad_norm": 0.17556857813224863, + "kl": 0.07666015625, + "learning_rate": 2.963749672572836e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1070 + }, + { + "completion_length": 1020.1666870117188, + "epoch": 0.16326219512195123, + "grad_norm": 0.21323528952618923, + "kl": 0.0830078125, + "learning_rate": 2.9635750517938364e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1071 + }, + { + "completion_length": 1424.8333740234375, + "epoch": 0.16341463414634147, + "grad_norm": 0.07181175678923313, + "kl": 0.068359375, + "learning_rate": 2.9634000166126534e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1072 + }, + { + "completion_length": 1350.3333740234375, + "epoch": 0.1635670731707317, + "grad_norm": 0.09063042819398044, + "kl": 0.079345703125, + "learning_rate": 2.9632245670788466e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1073 + }, + { + "completion_length": 1276.8333740234375, + "epoch": 0.16371951219512196, + "grad_norm": 0.09328637220043817, + "kl": 0.07763671875, + "learning_rate": 2.9630487032420935e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1074 + }, + { + "completion_length": 2231.5000610351562, + "epoch": 0.1638719512195122, + "grad_norm": 0.08883868280366074, + "kl": 0.071044921875, + "learning_rate": 2.9628724251521892e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1075 + }, + { + "completion_length": 1943.166748046875, + "epoch": 0.16402439024390245, + "grad_norm": 0.07658145117362243, + "kl": 0.080322265625, + "learning_rate": 2.962695732859045e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1076 + }, + { + "completion_length": 1867.3333740234375, + "epoch": 0.1641768292682927, + "grad_norm": 0.21298345079456718, + "kl": 0.083251953125, + "learning_rate": 2.9625186264126906e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1077 + }, + { + "completion_length": 2152.0001220703125, + "epoch": 0.16432926829268293, + "grad_norm": 0.10178335686564187, + "kl": 0.0751953125, + "learning_rate": 2.9623411058632726e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1078 + }, + { + "completion_length": 1664.5, + "epoch": 0.16448170731707318, + "grad_norm": 0.3927309803253449, + "kl": 0.275634765625, + "learning_rate": 2.9621631712610557e-06, + "loss": 0.011, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1079 + }, + { + "completion_length": 1837.1666870117188, + "epoch": 0.16463414634146342, + "grad_norm": 0.08695298292862444, + "kl": 0.0682373046875, + "learning_rate": 2.9619848226564196e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1080 + }, + { + "completion_length": 2958.5001220703125, + "epoch": 0.16478658536585367, + "grad_norm": 2.8832888911063774, + "kl": 0.15087890625, + "learning_rate": 2.9618060600998636e-06, + "loss": 0.006, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1081 + }, + { + "completion_length": 1332.5000610351562, + "epoch": 0.1649390243902439, + "grad_norm": 0.0837593330629733, + "kl": 0.0560302734375, + "learning_rate": 2.9616268836420026e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1082 + }, + { + "completion_length": 1795.3333740234375, + "epoch": 0.16509146341463415, + "grad_norm": 0.11971679150228222, + "kl": 0.0765380859375, + "learning_rate": 2.9614472933335693e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1083 + }, + { + "completion_length": 2274.6666870117188, + "epoch": 0.1652439024390244, + "grad_norm": 0.06069163445277905, + "kl": 0.0595703125, + "learning_rate": 2.961267289225414e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1084 + }, + { + "completion_length": 1077.8333740234375, + "epoch": 0.16539634146341464, + "grad_norm": 0.1628410816704752, + "kl": 0.07666015625, + "learning_rate": 2.9610868713685036e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1085 + }, + { + "completion_length": 579.3333435058594, + "epoch": 0.1655487804878049, + "grad_norm": 0.1713712043951878, + "kl": 0.138671875, + "learning_rate": 2.960906039813922e-06, + "loss": 0.0055, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1086 + }, + { + "completion_length": 1397.1666870117188, + "epoch": 0.16570121951219513, + "grad_norm": 0.11230904976354576, + "kl": 0.080322265625, + "learning_rate": 2.960724794612871e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1087 + }, + { + "completion_length": 1438.3333740234375, + "epoch": 0.16585365853658537, + "grad_norm": 0.15070274366249117, + "kl": 0.0767822265625, + "learning_rate": 2.9605431358166687e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1088 + }, + { + "completion_length": 1650.8333740234375, + "epoch": 0.16600609756097562, + "grad_norm": 0.16103450466303018, + "kl": 0.07421875, + "learning_rate": 2.9603610634767508e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1089 + }, + { + "completion_length": 1847.3333740234375, + "epoch": 0.16615853658536586, + "grad_norm": 0.15108757859958186, + "kl": 0.082275390625, + "learning_rate": 2.9601785776446697e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1090 + }, + { + "completion_length": 858.5, + "epoch": 0.1663109756097561, + "grad_norm": 0.18969719602554141, + "kl": 0.0830078125, + "learning_rate": 2.9599956783720955e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1091 + }, + { + "completion_length": 1097.6666870117188, + "epoch": 0.16646341463414635, + "grad_norm": 0.09947748263006291, + "kl": 0.08740234375, + "learning_rate": 2.959812365710815e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1092 + }, + { + "completion_length": 769.6666870117188, + "epoch": 0.1666158536585366, + "grad_norm": 1.801015893888817, + "kl": 0.068115234375, + "learning_rate": 2.959628639712732e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1093 + }, + { + "completion_length": 2196.3333740234375, + "epoch": 0.16676829268292684, + "grad_norm": 0.09852299720029903, + "kl": 0.05517578125, + "learning_rate": 2.9594445004298667e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1094 + }, + { + "completion_length": 1369.0000610351562, + "epoch": 0.16692073170731708, + "grad_norm": 0.14201684860559252, + "kl": 0.06640625, + "learning_rate": 2.959259947914358e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1095 + }, + { + "completion_length": 1128.3333740234375, + "epoch": 0.16707317073170733, + "grad_norm": 0.1257912416266534, + "kl": 0.092529296875, + "learning_rate": 2.9590749822184602e-06, + "loss": 0.0037, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1096 + }, + { + "completion_length": 1722.3333740234375, + "epoch": 0.16722560975609757, + "grad_norm": 0.10035636478318895, + "kl": 0.0880126953125, + "learning_rate": 2.9588896033945452e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1097 + }, + { + "completion_length": 2109.1666870117188, + "epoch": 0.16737804878048781, + "grad_norm": 0.06862284645857424, + "kl": 0.0626220703125, + "learning_rate": 2.9587038114951023e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1098 + }, + { + "completion_length": 1644.666748046875, + "epoch": 0.16753048780487806, + "grad_norm": 0.06884738375770455, + "kl": 0.0670166015625, + "learning_rate": 2.9585176065727373e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1099 + }, + { + "completion_length": 1645.3333740234375, + "epoch": 0.1676829268292683, + "grad_norm": 0.13522048671189174, + "kl": 0.0726318359375, + "learning_rate": 2.958330988680172e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1100 + }, + { + "completion_length": 2622.0, + "epoch": 0.16783536585365855, + "grad_norm": 0.12573569052886543, + "kl": 0.061279296875, + "learning_rate": 2.9581439578702474e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1101 + }, + { + "completion_length": 2033.166748046875, + "epoch": 0.1679878048780488, + "grad_norm": 0.08047163027004844, + "kl": 0.0682373046875, + "learning_rate": 2.957956514195919e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1102 + }, + { + "completion_length": 2790.83349609375, + "epoch": 0.16814024390243903, + "grad_norm": 0.08602987645965457, + "kl": 0.0596923828125, + "learning_rate": 2.9577686577102613e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1103 + }, + { + "completion_length": 2201.666748046875, + "epoch": 0.16829268292682928, + "grad_norm": 0.0797175273351221, + "kl": 0.057861328125, + "learning_rate": 2.9575803884664634e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1104 + }, + { + "completion_length": 1969.3334350585938, + "epoch": 0.16844512195121952, + "grad_norm": 0.06603131411719639, + "kl": 0.070068359375, + "learning_rate": 2.9573917065178345e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1105 + }, + { + "completion_length": 1583.5000610351562, + "epoch": 0.16859756097560977, + "grad_norm": 0.1373046293222806, + "kl": 0.08984375, + "learning_rate": 2.9572026119177967e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1106 + }, + { + "completion_length": 1166.5, + "epoch": 0.16875, + "grad_norm": 0.09633199690967655, + "kl": 0.095703125, + "learning_rate": 2.957013104719892e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1107 + }, + { + "completion_length": 3633.5001220703125, + "epoch": 0.16890243902439026, + "grad_norm": 0.05048746117021704, + "kl": 0.0467529296875, + "learning_rate": 2.9568231849777785e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1108 + }, + { + "completion_length": 2541.1666870117188, + "epoch": 0.1690548780487805, + "grad_norm": 0.08496766035939159, + "kl": 0.07080078125, + "learning_rate": 2.95663285274523e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1109 + }, + { + "completion_length": 1955.0, + "epoch": 0.16920731707317074, + "grad_norm": 0.10111068114019556, + "kl": 0.06640625, + "learning_rate": 2.956442108076138e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1110 + }, + { + "completion_length": 2316.666748046875, + "epoch": 0.169359756097561, + "grad_norm": 0.08487079378028556, + "kl": 0.063720703125, + "learning_rate": 2.956250951024512e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1111 + }, + { + "completion_length": 2536.0000610351562, + "epoch": 0.16951219512195123, + "grad_norm": 0.07999853268634831, + "kl": 0.05810546875, + "learning_rate": 2.9560593816444746e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1112 + }, + { + "completion_length": 3303.8333740234375, + "epoch": 0.16966463414634148, + "grad_norm": 0.08435495530139224, + "kl": 0.0413818359375, + "learning_rate": 2.955867399990269e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1113 + }, + { + "completion_length": 2495.5, + "epoch": 0.16981707317073172, + "grad_norm": 0.06591165663350453, + "kl": 0.066162109375, + "learning_rate": 2.9556750061162537e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1114 + }, + { + "completion_length": 1838.5000610351562, + "epoch": 0.16996951219512196, + "grad_norm": 0.09735618935790304, + "kl": 0.074462890625, + "learning_rate": 2.9554822000769027e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1115 + }, + { + "completion_length": 2227.8333740234375, + "epoch": 0.1701219512195122, + "grad_norm": 0.11738566450456844, + "kl": 0.0628662109375, + "learning_rate": 2.9552889819268095e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1116 + }, + { + "completion_length": 3164.3333740234375, + "epoch": 0.17027439024390245, + "grad_norm": 0.04249418703916415, + "kl": 0.0418701171875, + "learning_rate": 2.955095351720681e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1117 + }, + { + "completion_length": 1885.666748046875, + "epoch": 0.1704268292682927, + "grad_norm": 0.10821125049800663, + "kl": 0.0684814453125, + "learning_rate": 2.9549013095133433e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1118 + }, + { + "completion_length": 828.0, + "epoch": 0.17057926829268294, + "grad_norm": 0.1208379588793967, + "kl": 0.074462890625, + "learning_rate": 2.954706855359738e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1119 + }, + { + "completion_length": 2143.166748046875, + "epoch": 0.17073170731707318, + "grad_norm": 0.08059028881504625, + "kl": 0.08203125, + "learning_rate": 2.9545119893149243e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1120 + }, + { + "completion_length": 1680.0, + "epoch": 0.17088414634146343, + "grad_norm": 0.10784251422453424, + "kl": 0.0623779296875, + "learning_rate": 2.954316711434076e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1121 + }, + { + "completion_length": 1906.666748046875, + "epoch": 0.17103658536585367, + "grad_norm": 0.14761688761951589, + "kl": 0.091796875, + "learning_rate": 2.9541210217724857e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1122 + }, + { + "completion_length": 1957.0, + "epoch": 0.17118902439024392, + "grad_norm": 0.06462048707172895, + "kl": 0.0494384765625, + "learning_rate": 2.9539249203855613e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1123 + }, + { + "completion_length": 1509.666748046875, + "epoch": 0.17134146341463416, + "grad_norm": 0.12470099414270724, + "kl": 0.0615234375, + "learning_rate": 2.953728407328828e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1124 + }, + { + "completion_length": 1922.0000610351562, + "epoch": 0.1714939024390244, + "grad_norm": 0.11254363581966646, + "kl": 0.092529296875, + "learning_rate": 2.9535314826579267e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1125 + }, + { + "completion_length": 2745.83349609375, + "epoch": 0.17164634146341465, + "grad_norm": 0.07319821877403464, + "kl": 0.05419921875, + "learning_rate": 2.953334146428616e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1126 + }, + { + "completion_length": 1714.3333740234375, + "epoch": 0.1717987804878049, + "grad_norm": 0.09286429885628109, + "kl": 0.077880859375, + "learning_rate": 2.9531363986967704e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1127 + }, + { + "completion_length": 2007.6666870117188, + "epoch": 0.1719512195121951, + "grad_norm": 0.07625210965952339, + "kl": 0.0751953125, + "learning_rate": 2.9529382395183812e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1128 + }, + { + "completion_length": 1891.0, + "epoch": 0.17210365853658535, + "grad_norm": 0.05946504377808837, + "kl": 0.0552978515625, + "learning_rate": 2.9527396689495544e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1129 + }, + { + "completion_length": 1811.166748046875, + "epoch": 0.1722560975609756, + "grad_norm": 0.10601238504226931, + "kl": 0.07080078125, + "learning_rate": 2.952540687046516e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1130 + }, + { + "completion_length": 1288.0, + "epoch": 0.17240853658536584, + "grad_norm": 0.11681899559900999, + "kl": 0.05615234375, + "learning_rate": 2.9523412938656057e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1131 + }, + { + "completion_length": 1494.0, + "epoch": 0.17256097560975608, + "grad_norm": 0.08442390013734433, + "kl": 0.07958984375, + "learning_rate": 2.9521414894632797e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1132 + }, + { + "completion_length": 1577.8333435058594, + "epoch": 0.17271341463414633, + "grad_norm": 0.1158075518776461, + "kl": 0.083251953125, + "learning_rate": 2.9519412738961123e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1133 + }, + { + "completion_length": 3020.666748046875, + "epoch": 0.17286585365853657, + "grad_norm": 0.051938371522155614, + "kl": 0.050048828125, + "learning_rate": 2.9517406472207933e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1134 + }, + { + "completion_length": 1922.666748046875, + "epoch": 0.17301829268292682, + "grad_norm": 0.09830806903801641, + "kl": 0.072509765625, + "learning_rate": 2.9515396094941286e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1135 + }, + { + "completion_length": 967.6666717529297, + "epoch": 0.17317073170731706, + "grad_norm": 0.14458264625558273, + "kl": 0.0494384765625, + "learning_rate": 2.9513381607730403e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1136 + }, + { + "completion_length": 1244.8333740234375, + "epoch": 0.1733231707317073, + "grad_norm": 0.1060726018618843, + "kl": 0.0830078125, + "learning_rate": 2.9511363011145683e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1137 + }, + { + "completion_length": 2162.8333740234375, + "epoch": 0.17347560975609755, + "grad_norm": 0.08235672525061287, + "kl": 0.0555419921875, + "learning_rate": 2.9509340305758676e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1138 + }, + { + "completion_length": 2971.6666870117188, + "epoch": 0.1736280487804878, + "grad_norm": 0.07461479627566445, + "kl": 0.0477294921875, + "learning_rate": 2.950731349214209e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1139 + }, + { + "completion_length": 1517.0000610351562, + "epoch": 0.17378048780487804, + "grad_norm": 0.07266218545317131, + "kl": 0.07080078125, + "learning_rate": 2.9505282570869825e-06, + "loss": 0.0028, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1140 + }, + { + "completion_length": 1928.5001220703125, + "epoch": 0.17393292682926828, + "grad_norm": 0.07560115971469125, + "kl": 0.058349609375, + "learning_rate": 2.95032475425169e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1141 + }, + { + "completion_length": 1419.8333740234375, + "epoch": 0.17408536585365852, + "grad_norm": 0.07574862282463261, + "kl": 0.051025390625, + "learning_rate": 2.9501208407659534e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1142 + }, + { + "completion_length": 1870.5, + "epoch": 0.17423780487804877, + "grad_norm": 0.07477819588088092, + "kl": 0.07275390625, + "learning_rate": 2.9499165166875092e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1143 + }, + { + "completion_length": 1597.6667175292969, + "epoch": 0.174390243902439, + "grad_norm": 0.08294156049809069, + "kl": 0.05908203125, + "learning_rate": 2.949711782074211e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1144 + }, + { + "completion_length": 1750.6666870117188, + "epoch": 0.17454268292682926, + "grad_norm": 0.06024745436826176, + "kl": 0.0528564453125, + "learning_rate": 2.949506636984027e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1145 + }, + { + "completion_length": 2238.0000610351562, + "epoch": 0.1746951219512195, + "grad_norm": 0.08268025224823956, + "kl": 0.055419921875, + "learning_rate": 2.9493010814750443e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1146 + }, + { + "completion_length": 1307.6666870117188, + "epoch": 0.17484756097560974, + "grad_norm": 0.10299676367464895, + "kl": 0.05908203125, + "learning_rate": 2.9490951156054634e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1147 + }, + { + "completion_length": 1043.3333740234375, + "epoch": 0.175, + "grad_norm": 0.11969431813024643, + "kl": 0.0828857421875, + "learning_rate": 2.9488887394336023e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1148 + }, + { + "completion_length": 1508.8333740234375, + "epoch": 0.17515243902439023, + "grad_norm": 0.10293274833474697, + "kl": 0.0692138671875, + "learning_rate": 2.948681953017896e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1149 + }, + { + "completion_length": 2698.666748046875, + "epoch": 0.17530487804878048, + "grad_norm": 0.06429760465885294, + "kl": 0.0450439453125, + "learning_rate": 2.948474756416894e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1150 + }, + { + "completion_length": 1335.0, + "epoch": 0.17545731707317072, + "grad_norm": 0.22752305607405976, + "kl": 0.085205078125, + "learning_rate": 2.9482671496892633e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1151 + }, + { + "completion_length": 1555.8333740234375, + "epoch": 0.17560975609756097, + "grad_norm": 0.1339829950845567, + "kl": 0.0626220703125, + "learning_rate": 2.948059132893786e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1152 + }, + { + "completion_length": 2905.666748046875, + "epoch": 0.1757621951219512, + "grad_norm": 0.05811380581613286, + "kl": 0.0367431640625, + "learning_rate": 2.947850706089361e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1153 + }, + { + "completion_length": 2318.0, + "epoch": 0.17591463414634145, + "grad_norm": 0.0712020613032545, + "kl": 0.0633544921875, + "learning_rate": 2.947641869335003e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1154 + }, + { + "completion_length": 1768.8333740234375, + "epoch": 0.1760670731707317, + "grad_norm": 0.08144072847763324, + "kl": 0.072998046875, + "learning_rate": 2.9474326226898426e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1155 + }, + { + "completion_length": 1697.1666870117188, + "epoch": 0.17621951219512194, + "grad_norm": 0.06662315491401204, + "kl": 0.05078125, + "learning_rate": 2.947222966213127e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1156 + }, + { + "completion_length": 2085.5, + "epoch": 0.17637195121951219, + "grad_norm": 0.08900076381201528, + "kl": 0.055419921875, + "learning_rate": 2.9470128999642193e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1157 + }, + { + "completion_length": 2359.5, + "epoch": 0.17652439024390243, + "grad_norm": 0.07485659826494408, + "kl": 0.0533447265625, + "learning_rate": 2.9468024240025985e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1158 + }, + { + "completion_length": 2057.3333740234375, + "epoch": 0.17667682926829267, + "grad_norm": 0.0848869981339714, + "kl": 0.06005859375, + "learning_rate": 2.946591538387859e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1159 + }, + { + "completion_length": 1571.166748046875, + "epoch": 0.17682926829268292, + "grad_norm": 0.08743516941686431, + "kl": 0.0660400390625, + "learning_rate": 2.9463802431797115e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1160 + }, + { + "completion_length": 1222.1666870117188, + "epoch": 0.17698170731707316, + "grad_norm": 0.09652919297560188, + "kl": 0.0791015625, + "learning_rate": 2.9461685384379837e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1161 + }, + { + "completion_length": 3168.83349609375, + "epoch": 0.1771341463414634, + "grad_norm": 0.05989760448584463, + "kl": 0.0516357421875, + "learning_rate": 2.945956424222618e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1162 + }, + { + "completion_length": 2239.6666870117188, + "epoch": 0.17728658536585365, + "grad_norm": 0.08858288056114454, + "kl": 0.0615234375, + "learning_rate": 2.9457439005936744e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1163 + }, + { + "completion_length": 2833.1666870117188, + "epoch": 0.1774390243902439, + "grad_norm": 0.6079024193178411, + "kl": 0.056884765625, + "learning_rate": 2.945530967611326e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1164 + }, + { + "completion_length": 2748.0001220703125, + "epoch": 0.17759146341463414, + "grad_norm": 0.06178984548825931, + "kl": 0.0531005859375, + "learning_rate": 2.945317625335864e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1165 + }, + { + "completion_length": 1889.666748046875, + "epoch": 0.17774390243902438, + "grad_norm": 0.089028728396241, + "kl": 0.073974609375, + "learning_rate": 2.9451038738276956e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1166 + }, + { + "completion_length": 2414.8333740234375, + "epoch": 0.17789634146341463, + "grad_norm": 1.6280583047713824, + "kl": 0.06170654296875, + "learning_rate": 2.9448897131473416e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1167 + }, + { + "completion_length": 2342.166748046875, + "epoch": 0.17804878048780487, + "grad_norm": 0.1029035689329004, + "kl": 0.06640625, + "learning_rate": 2.9446751433554426e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1168 + }, + { + "completion_length": 2226.8333740234375, + "epoch": 0.1782012195121951, + "grad_norm": 1.1024312776696337, + "kl": 0.0560302734375, + "learning_rate": 2.944460164512751e-06, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1169 + }, + { + "completion_length": 1982.1667175292969, + "epoch": 0.17835365853658536, + "grad_norm": 0.07198413682199412, + "kl": 0.0604248046875, + "learning_rate": 2.9442447766801373e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1170 + }, + { + "completion_length": 1223.8333587646484, + "epoch": 0.1785060975609756, + "grad_norm": 0.17227030713014385, + "kl": 0.0936279296875, + "learning_rate": 2.9440289799185868e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1171 + }, + { + "completion_length": 1756.3333740234375, + "epoch": 0.17865853658536585, + "grad_norm": 0.07160218854560149, + "kl": 0.0557861328125, + "learning_rate": 2.9438127742892012e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1172 + }, + { + "completion_length": 1241.0, + "epoch": 0.1788109756097561, + "grad_norm": 0.11059341804305368, + "kl": 0.09521484375, + "learning_rate": 2.9435961598531983e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1173 + }, + { + "completion_length": 1953.0001220703125, + "epoch": 0.17896341463414633, + "grad_norm": 0.09292358553598742, + "kl": 0.08349609375, + "learning_rate": 2.9433791366719103e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1174 + }, + { + "completion_length": 1694.0000610351562, + "epoch": 0.17911585365853658, + "grad_norm": 0.13991151653012335, + "kl": 0.06103515625, + "learning_rate": 2.943161704806787e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1175 + }, + { + "completion_length": 1067.0000610351562, + "epoch": 0.17926829268292682, + "grad_norm": 0.13061962984359865, + "kl": 0.081787109375, + "learning_rate": 2.942943864319392e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1176 + }, + { + "completion_length": 1638.3333435058594, + "epoch": 0.17942073170731707, + "grad_norm": 0.102272472357198, + "kl": 0.0635986328125, + "learning_rate": 2.9427256152714055e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1177 + }, + { + "completion_length": 1722.8333740234375, + "epoch": 0.1795731707317073, + "grad_norm": 0.08012629865885412, + "kl": 0.0562744140625, + "learning_rate": 2.9425069577246243e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1178 + }, + { + "completion_length": 1368.1666870117188, + "epoch": 0.17972560975609755, + "grad_norm": 0.16390434215879449, + "kl": 0.070068359375, + "learning_rate": 2.942287891740959e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1179 + }, + { + "completion_length": 3021.33349609375, + "epoch": 0.1798780487804878, + "grad_norm": 1.4827798326592674, + "kl": 0.0635986328125, + "learning_rate": 2.9420684173824365e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1180 + }, + { + "completion_length": 2336.166748046875, + "epoch": 0.18003048780487804, + "grad_norm": 0.07063102483639144, + "kl": 0.0694580078125, + "learning_rate": 2.9418485347112007e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1181 + }, + { + "completion_length": 892.0, + "epoch": 0.1801829268292683, + "grad_norm": 0.11098431005092406, + "kl": 0.0693359375, + "learning_rate": 2.9416282437895092e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1182 + }, + { + "completion_length": 2579.5001220703125, + "epoch": 0.18033536585365853, + "grad_norm": 0.10690793687719077, + "kl": 0.06884765625, + "learning_rate": 2.9414075446797365e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1183 + }, + { + "completion_length": 1634.166748046875, + "epoch": 0.18048780487804877, + "grad_norm": 0.08444010277014885, + "kl": 0.0712890625, + "learning_rate": 2.941186437444372e-06, + "loss": 0.0028, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1184 + }, + { + "completion_length": 1717.3333740234375, + "epoch": 0.18064024390243902, + "grad_norm": 0.10494978987454288, + "kl": 0.083740234375, + "learning_rate": 2.94096492214602e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1185 + }, + { + "completion_length": 1726.5000610351562, + "epoch": 0.18079268292682926, + "grad_norm": 0.5430882394295515, + "kl": 0.0740966796875, + "learning_rate": 2.9407429988474027e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1186 + }, + { + "completion_length": 2297.166748046875, + "epoch": 0.1809451219512195, + "grad_norm": 0.12384728891377819, + "kl": 0.0775146484375, + "learning_rate": 2.940520667611355e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1187 + }, + { + "completion_length": 3284.5001220703125, + "epoch": 0.18109756097560975, + "grad_norm": 0.07487183426596854, + "kl": 0.056640625, + "learning_rate": 2.94029792850083e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1188 + }, + { + "completion_length": 2630.6666870117188, + "epoch": 0.18125, + "grad_norm": 0.11196006250647925, + "kl": 0.074951171875, + "learning_rate": 2.940074781578893e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1189 + }, + { + "completion_length": 2627.8333740234375, + "epoch": 0.18140243902439024, + "grad_norm": 1.3414190624050333, + "kl": 0.078125, + "learning_rate": 2.939851226908728e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1190 + }, + { + "completion_length": 1495.0, + "epoch": 0.18155487804878048, + "grad_norm": 0.08915585811926485, + "kl": 0.08740234375, + "learning_rate": 2.9396272645536334e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1191 + }, + { + "completion_length": 2576.8333740234375, + "epoch": 0.18170731707317073, + "grad_norm": 0.19124331301519432, + "kl": 0.0738525390625, + "learning_rate": 2.939402894577022e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1192 + }, + { + "completion_length": 2586.0001220703125, + "epoch": 0.18185975609756097, + "grad_norm": 0.08277606778158185, + "kl": 0.0660400390625, + "learning_rate": 2.9391781170424227e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1193 + }, + { + "completion_length": 2233.3334350585938, + "epoch": 0.18201219512195121, + "grad_norm": 0.2332981907732422, + "kl": 0.080322265625, + "learning_rate": 2.93895293201348e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1194 + }, + { + "completion_length": 3305.166748046875, + "epoch": 0.18216463414634146, + "grad_norm": 0.05880849286506802, + "kl": 0.0445556640625, + "learning_rate": 2.938727339553954e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1195 + }, + { + "completion_length": 2852.5001220703125, + "epoch": 0.1823170731707317, + "grad_norm": 0.07406768127556236, + "kl": 0.073974609375, + "learning_rate": 2.9385013397277197e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1196 + }, + { + "completion_length": 3073.166748046875, + "epoch": 0.18246951219512195, + "grad_norm": 0.1399432542623397, + "kl": 0.0546875, + "learning_rate": 2.9382749325987668e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1197 + }, + { + "completion_length": 2684.1666870117188, + "epoch": 0.1826219512195122, + "grad_norm": 0.08944063069997174, + "kl": 0.0714111328125, + "learning_rate": 2.9380481182312026e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1198 + }, + { + "completion_length": 3328.5, + "epoch": 0.18277439024390243, + "grad_norm": 0.04688734604596107, + "kl": 0.04638671875, + "learning_rate": 2.9378208966892464e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1199 + }, + { + "completion_length": 3627.666748046875, + "epoch": 0.18292682926829268, + "grad_norm": 0.046255092581180185, + "kl": 0.03955078125, + "learning_rate": 2.9375932680372358e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1200 + }, + { + "completion_length": 2224.83349609375, + "epoch": 0.18307926829268292, + "grad_norm": 0.09065079076557406, + "kl": 0.0611572265625, + "learning_rate": 2.9373652323396222e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1201 + }, + { + "completion_length": 2422.666748046875, + "epoch": 0.18323170731707317, + "grad_norm": 0.10259359457464173, + "kl": 0.0562744140625, + "learning_rate": 2.937136789660972e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1202 + }, + { + "completion_length": 2048.3333740234375, + "epoch": 0.1833841463414634, + "grad_norm": 0.08474882876255574, + "kl": 0.059814453125, + "learning_rate": 2.9369079400659676e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1203 + }, + { + "completion_length": 2694.0001220703125, + "epoch": 0.18353658536585366, + "grad_norm": 0.06753838217291908, + "kl": 0.0711669921875, + "learning_rate": 2.936678683619407e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1204 + }, + { + "completion_length": 2523.8333740234375, + "epoch": 0.1836890243902439, + "grad_norm": 0.06749565916670915, + "kl": 0.05419921875, + "learning_rate": 2.936449020386202e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1205 + }, + { + "completion_length": 2755.666748046875, + "epoch": 0.18384146341463414, + "grad_norm": 0.08012927892052915, + "kl": 0.0562744140625, + "learning_rate": 2.93621895043138e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1206 + }, + { + "completion_length": 2789.166748046875, + "epoch": 0.1839939024390244, + "grad_norm": 0.08933122999966757, + "kl": 0.0491943359375, + "learning_rate": 2.9359884738200845e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1207 + }, + { + "completion_length": 1186.0, + "epoch": 0.18414634146341463, + "grad_norm": 0.106982630545841, + "kl": 0.087158203125, + "learning_rate": 2.935757590617574e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1208 + }, + { + "completion_length": 3016.3333740234375, + "epoch": 0.18429878048780488, + "grad_norm": 0.15706773425649292, + "kl": 0.0511474609375, + "learning_rate": 2.9355263008892205e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1209 + }, + { + "completion_length": 2622.5001220703125, + "epoch": 0.18445121951219512, + "grad_norm": 0.08350142143358788, + "kl": 0.0771484375, + "learning_rate": 2.9352946047005128e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1210 + }, + { + "completion_length": 2909.0, + "epoch": 0.18460365853658536, + "grad_norm": 0.26589590493033133, + "kl": 0.0748291015625, + "learning_rate": 2.9350625021170542e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1211 + }, + { + "completion_length": 2452.0001220703125, + "epoch": 0.1847560975609756, + "grad_norm": 0.06741606600231928, + "kl": 0.0577392578125, + "learning_rate": 2.9348299932045632e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1212 + }, + { + "completion_length": 3009.8333740234375, + "epoch": 0.18490853658536585, + "grad_norm": 0.05110697844822714, + "kl": 0.0450439453125, + "learning_rate": 2.934597078028873e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1213 + }, + { + "completion_length": 2343.5000610351562, + "epoch": 0.1850609756097561, + "grad_norm": 0.1299340496815928, + "kl": 0.0711669921875, + "learning_rate": 2.9343637566559326e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1214 + }, + { + "completion_length": 1628.3333740234375, + "epoch": 0.18521341463414634, + "grad_norm": 0.07462220509926719, + "kl": 0.059814453125, + "learning_rate": 2.934130029151805e-06, + "loss": 0.0024, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1215 + }, + { + "completion_length": 2974.6666870117188, + "epoch": 0.18536585365853658, + "grad_norm": 0.057136378884765304, + "kl": 0.0498046875, + "learning_rate": 2.9338958955826685e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1216 + }, + { + "completion_length": 2371.3333740234375, + "epoch": 0.18551829268292683, + "grad_norm": 0.060966025356242876, + "kl": 0.06884765625, + "learning_rate": 2.933661356014817e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1217 + }, + { + "completion_length": 2948.0, + "epoch": 0.18567073170731707, + "grad_norm": 0.05514948590437257, + "kl": 0.04583740234375, + "learning_rate": 2.9334264105146594e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1218 + }, + { + "completion_length": 3568.0, + "epoch": 0.18582317073170732, + "grad_norm": 0.047222861601077575, + "kl": 0.04052734375, + "learning_rate": 2.9331910591487182e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1219 + }, + { + "completion_length": 1951.8333740234375, + "epoch": 0.18597560975609756, + "grad_norm": 0.09600283556132888, + "kl": 0.06689453125, + "learning_rate": 2.932955301983631e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1220 + }, + { + "completion_length": 1360.6666870117188, + "epoch": 0.1861280487804878, + "grad_norm": 0.12153755374270812, + "kl": 0.085205078125, + "learning_rate": 2.9327191390861534e-06, + "loss": 0.0034, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1221 + }, + { + "completion_length": 1187.0000610351562, + "epoch": 0.18628048780487805, + "grad_norm": 0.13340222928305245, + "kl": 0.103759765625, + "learning_rate": 2.9324825705231512e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1222 + }, + { + "completion_length": 2578.5, + "epoch": 0.1864329268292683, + "grad_norm": 0.08444691794930316, + "kl": 0.0560302734375, + "learning_rate": 2.9322455963616084e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1223 + }, + { + "completion_length": 1627.8333435058594, + "epoch": 0.18658536585365854, + "grad_norm": 1.9184537985298167, + "kl": 0.06298828125, + "learning_rate": 2.9320082166686226e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1224 + }, + { + "completion_length": 1808.8334350585938, + "epoch": 0.18673780487804878, + "grad_norm": 0.16275828069682802, + "kl": 0.0772705078125, + "learning_rate": 2.9317704315114055e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1225 + }, + { + "completion_length": 2599.5, + "epoch": 0.18689024390243902, + "grad_norm": 0.07140086155362439, + "kl": 0.0657958984375, + "learning_rate": 2.931532240957286e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1226 + }, + { + "completion_length": 2106.5000610351562, + "epoch": 0.18704268292682927, + "grad_norm": 0.07937012620529031, + "kl": 0.0670166015625, + "learning_rate": 2.931293645073705e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1227 + }, + { + "completion_length": 1871.8333740234375, + "epoch": 0.1871951219512195, + "grad_norm": 0.10073259951835242, + "kl": 0.0811767578125, + "learning_rate": 2.9310546439282207e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1228 + }, + { + "completion_length": 1604.3333740234375, + "epoch": 0.18734756097560976, + "grad_norm": 0.09501961151704087, + "kl": 0.0693359375, + "learning_rate": 2.9308152375885033e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1229 + }, + { + "completion_length": 2161.666748046875, + "epoch": 0.1875, + "grad_norm": 0.0966587660482477, + "kl": 0.056396484375, + "learning_rate": 2.9305754261223403e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1230 + }, + { + "completion_length": 2448.0, + "epoch": 0.18765243902439024, + "grad_norm": 0.06128652987722124, + "kl": 0.0552978515625, + "learning_rate": 2.9303352095976324e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1231 + }, + { + "completion_length": 2816.3333740234375, + "epoch": 0.1878048780487805, + "grad_norm": 0.2445131769438128, + "kl": 0.0653076171875, + "learning_rate": 2.9300945880823955e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1232 + }, + { + "completion_length": 1717.5, + "epoch": 0.18795731707317073, + "grad_norm": 0.09718344393994638, + "kl": 0.0751953125, + "learning_rate": 2.9298535616447607e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1233 + }, + { + "completion_length": 1157.1667175292969, + "epoch": 0.18810975609756098, + "grad_norm": 0.15839533714022883, + "kl": 0.107421875, + "learning_rate": 2.929612130352972e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1234 + }, + { + "completion_length": 2146.8333740234375, + "epoch": 0.18826219512195122, + "grad_norm": 0.08041555286036065, + "kl": 0.0621337890625, + "learning_rate": 2.9293702942753898e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1235 + }, + { + "completion_length": 1620.3333740234375, + "epoch": 0.18841463414634146, + "grad_norm": 0.15388510576019437, + "kl": 0.106201171875, + "learning_rate": 2.9291280534804884e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1236 + }, + { + "completion_length": 3380.8333740234375, + "epoch": 0.1885670731707317, + "grad_norm": 0.04562157951947845, + "kl": 0.044921875, + "learning_rate": 2.928885408036857e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1237 + }, + { + "completion_length": 1596.3333740234375, + "epoch": 0.18871951219512195, + "grad_norm": 0.09131190539370301, + "kl": 0.0816650390625, + "learning_rate": 2.9286423580131984e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1238 + }, + { + "completion_length": 2536.8333740234375, + "epoch": 0.1888719512195122, + "grad_norm": 0.08570277678885639, + "kl": 0.0599365234375, + "learning_rate": 2.928398903478332e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1239 + }, + { + "completion_length": 1692.8334197998047, + "epoch": 0.18902439024390244, + "grad_norm": 0.13058529052126522, + "kl": 0.0899658203125, + "learning_rate": 2.928155044501189e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1240 + }, + { + "completion_length": 1934.3333740234375, + "epoch": 0.18917682926829268, + "grad_norm": 0.08667474236854066, + "kl": 0.045654296875, + "learning_rate": 2.9279107811508176e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1241 + }, + { + "completion_length": 3145.3333740234375, + "epoch": 0.18932926829268293, + "grad_norm": 0.055511563615782744, + "kl": 0.052001953125, + "learning_rate": 2.9276661134963784e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1242 + }, + { + "completion_length": 3200.3333740234375, + "epoch": 0.18948170731707317, + "grad_norm": 0.29471727515812324, + "kl": 0.06396484375, + "learning_rate": 2.9274210416071487e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1243 + }, + { + "completion_length": 2354.3333740234375, + "epoch": 0.18963414634146342, + "grad_norm": 0.05881843090599001, + "kl": 0.0587158203125, + "learning_rate": 2.9271755655525186e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1244 + }, + { + "completion_length": 2295.0000610351562, + "epoch": 0.18978658536585366, + "grad_norm": 0.08165676950062939, + "kl": 0.0565185546875, + "learning_rate": 2.9269296854019932e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1245 + }, + { + "completion_length": 1182.3333740234375, + "epoch": 0.1899390243902439, + "grad_norm": 0.0783595552889891, + "kl": 0.07275390625, + "learning_rate": 2.9266834012251914e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1246 + }, + { + "completion_length": 2469.3333740234375, + "epoch": 0.19009146341463415, + "grad_norm": 0.08617038736658883, + "kl": 0.071533203125, + "learning_rate": 2.926436713091848e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1247 + }, + { + "completion_length": 2481.166748046875, + "epoch": 0.1902439024390244, + "grad_norm": 0.049503635597878785, + "kl": 0.055419921875, + "learning_rate": 2.9261896210718106e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1248 + }, + { + "completion_length": 1613.0000610351562, + "epoch": 0.19039634146341464, + "grad_norm": 0.07292045268744013, + "kl": 0.05810546875, + "learning_rate": 2.925942125235042e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1249 + }, + { + "completion_length": 2174.0000610351562, + "epoch": 0.19054878048780488, + "grad_norm": 0.1201860455729421, + "kl": 0.0655517578125, + "learning_rate": 2.9256942256516185e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1250 + }, + { + "completion_length": 1113.6666870117188, + "epoch": 0.19070121951219512, + "grad_norm": 0.1310371296391152, + "kl": 0.053466796875, + "learning_rate": 2.9254459223917323e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1251 + }, + { + "completion_length": 2424.5, + "epoch": 0.19085365853658537, + "grad_norm": 0.047881929386776885, + "kl": 0.0457763671875, + "learning_rate": 2.925197215525688e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1252 + }, + { + "completion_length": 1572.1666870117188, + "epoch": 0.1910060975609756, + "grad_norm": 0.1486684817234935, + "kl": 0.0650634765625, + "learning_rate": 2.924948105123906e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1253 + }, + { + "completion_length": 1786.6666870117188, + "epoch": 0.19115853658536586, + "grad_norm": 0.06995478658877599, + "kl": 0.060791015625, + "learning_rate": 2.92469859125692e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1254 + }, + { + "completion_length": 2191.3333740234375, + "epoch": 0.1913109756097561, + "grad_norm": 0.17006890250489975, + "kl": 0.071533203125, + "learning_rate": 2.9244486739953787e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1255 + }, + { + "completion_length": 2116.3333740234375, + "epoch": 0.19146341463414634, + "grad_norm": 0.13415640185878985, + "kl": 0.0816650390625, + "learning_rate": 2.924198353410044e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1256 + }, + { + "completion_length": 2576.0001220703125, + "epoch": 0.1916158536585366, + "grad_norm": 0.13775849413456784, + "kl": 0.0718994140625, + "learning_rate": 2.9239476295717938e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1257 + }, + { + "completion_length": 2198.666748046875, + "epoch": 0.19176829268292683, + "grad_norm": 0.07445852606659376, + "kl": 0.0567626953125, + "learning_rate": 2.9236965025516174e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1258 + }, + { + "completion_length": 3116.8333740234375, + "epoch": 0.19192073170731708, + "grad_norm": 0.08915788044914044, + "kl": 0.061767578125, + "learning_rate": 2.9234449724206212e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1259 + }, + { + "completion_length": 1558.5001220703125, + "epoch": 0.19207317073170732, + "grad_norm": 1.5008726968348873, + "kl": 0.070556640625, + "learning_rate": 2.923193039250024e-06, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1260 + }, + { + "completion_length": 2720.0, + "epoch": 0.19222560975609757, + "grad_norm": 2.0144250221205797, + "kl": 0.057373046875, + "learning_rate": 2.9229407031111586e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1261 + }, + { + "completion_length": 2283.666748046875, + "epoch": 0.1923780487804878, + "grad_norm": 0.08055256059829918, + "kl": 0.075927734375, + "learning_rate": 2.9226879640754727e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1262 + }, + { + "completion_length": 2873.8333740234375, + "epoch": 0.19253048780487805, + "grad_norm": 0.08728229685463308, + "kl": 0.0567626953125, + "learning_rate": 2.922434822214528e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1263 + }, + { + "completion_length": 2338.166748046875, + "epoch": 0.1926829268292683, + "grad_norm": 0.8576965366385138, + "kl": 0.06689453125, + "learning_rate": 2.9221812776000003e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1264 + }, + { + "completion_length": 2262.3333740234375, + "epoch": 0.19283536585365854, + "grad_norm": 0.12365692160767333, + "kl": 0.05517578125, + "learning_rate": 2.9219273303036784e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1265 + }, + { + "completion_length": 2698.5, + "epoch": 0.19298780487804879, + "grad_norm": 0.08065068818036925, + "kl": 0.0560302734375, + "learning_rate": 2.9216729803974666e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1266 + }, + { + "completion_length": 2375.0001220703125, + "epoch": 0.19314024390243903, + "grad_norm": 0.06682303456564637, + "kl": 0.068603515625, + "learning_rate": 2.921418227953382e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1267 + }, + { + "completion_length": 1931.0, + "epoch": 0.19329268292682927, + "grad_norm": 0.08958543819824148, + "kl": 0.0567626953125, + "learning_rate": 2.9211630730435564e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1268 + }, + { + "completion_length": 2285.666748046875, + "epoch": 0.19344512195121952, + "grad_norm": 0.09095895545996195, + "kl": 0.064208984375, + "learning_rate": 2.9209075157402358e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1269 + }, + { + "completion_length": 3090.83349609375, + "epoch": 0.19359756097560976, + "grad_norm": 0.13374926880514784, + "kl": 0.0543212890625, + "learning_rate": 2.9206515561157783e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1270 + }, + { + "completion_length": 2002.5001220703125, + "epoch": 0.19375, + "grad_norm": 0.2538440379470808, + "kl": 0.136474609375, + "learning_rate": 2.9203951942426586e-06, + "loss": 0.0055, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1271 + }, + { + "completion_length": 3686.0, + "epoch": 0.19390243902439025, + "grad_norm": 0.13972074936882597, + "kl": 0.0523681640625, + "learning_rate": 2.9201384301934632e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1272 + }, + { + "completion_length": 2830.0, + "epoch": 0.1940548780487805, + "grad_norm": 0.13407900742257128, + "kl": 0.092529296875, + "learning_rate": 2.919881264040894e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1273 + }, + { + "completion_length": 2397.5000610351562, + "epoch": 0.19420731707317074, + "grad_norm": 0.062378063946491265, + "kl": 0.054443359375, + "learning_rate": 2.9196236958577658e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1274 + }, + { + "completion_length": 2295.666748046875, + "epoch": 0.19435975609756098, + "grad_norm": 0.08601057358275357, + "kl": 0.0601806640625, + "learning_rate": 2.9193657257170066e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1275 + }, + { + "completion_length": 2409.166748046875, + "epoch": 0.19451219512195123, + "grad_norm": 0.08833165451253937, + "kl": 0.0577392578125, + "learning_rate": 2.91910735369166e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1276 + }, + { + "completion_length": 2529.3333740234375, + "epoch": 0.19466463414634147, + "grad_norm": 0.14983318169786122, + "kl": 0.0625, + "learning_rate": 2.918848579854882e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1277 + }, + { + "completion_length": 2492.8333435058594, + "epoch": 0.1948170731707317, + "grad_norm": 0.10297219988644553, + "kl": 0.069091796875, + "learning_rate": 2.9185894042799423e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1278 + }, + { + "completion_length": 3171.0, + "epoch": 0.19496951219512196, + "grad_norm": 0.1804697491352403, + "kl": 0.064453125, + "learning_rate": 2.918329827040226e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1279 + }, + { + "completion_length": 2856.166748046875, + "epoch": 0.1951219512195122, + "grad_norm": 0.08641446974986454, + "kl": 0.057861328125, + "learning_rate": 2.9180698482092302e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1280 + }, + { + "completion_length": 3073.0001220703125, + "epoch": 0.19527439024390245, + "grad_norm": 0.06557929704584131, + "kl": 0.054443359375, + "learning_rate": 2.917809467860566e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1281 + }, + { + "completion_length": 3248.0, + "epoch": 0.1954268292682927, + "grad_norm": 0.058370597234837715, + "kl": 0.04364013671875, + "learning_rate": 2.917548686067959e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1282 + }, + { + "completion_length": 2848.5, + "epoch": 0.19557926829268293, + "grad_norm": 0.06181808398794007, + "kl": 0.059814453125, + "learning_rate": 2.917287502905248e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1283 + }, + { + "completion_length": 2929.83349609375, + "epoch": 0.19573170731707318, + "grad_norm": 0.07083181382721707, + "kl": 0.053466796875, + "learning_rate": 2.917025918446385e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1284 + }, + { + "completion_length": 2316.666748046875, + "epoch": 0.19588414634146342, + "grad_norm": 4.5798526793344125, + "kl": 0.1583251953125, + "learning_rate": 2.9167639327654355e-06, + "loss": 0.0063, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1285 + }, + { + "completion_length": 2052.0000915527344, + "epoch": 0.19603658536585367, + "grad_norm": 0.1853168254033053, + "kl": 0.0565185546875, + "learning_rate": 2.9165015459365808e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1286 + }, + { + "completion_length": 2691.1666870117188, + "epoch": 0.1961890243902439, + "grad_norm": 0.05971139282546335, + "kl": 0.05364990234375, + "learning_rate": 2.916238758034112e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1287 + }, + { + "completion_length": 2889.6666870117188, + "epoch": 0.19634146341463415, + "grad_norm": 0.08405596071378987, + "kl": 0.0594482421875, + "learning_rate": 2.9159755691324377e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1288 + }, + { + "completion_length": 3254.0, + "epoch": 0.1964939024390244, + "grad_norm": 0.06295822947103566, + "kl": 0.0623779296875, + "learning_rate": 2.9157119793060773e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1289 + }, + { + "completion_length": 1945.3333740234375, + "epoch": 0.19664634146341464, + "grad_norm": 0.11910430921905757, + "kl": 0.077880859375, + "learning_rate": 2.915447988629664e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1290 + }, + { + "completion_length": 1904.6666870117188, + "epoch": 0.1967987804878049, + "grad_norm": 0.16182413219977645, + "kl": 0.0859375, + "learning_rate": 2.9151835971779465e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1291 + }, + { + "completion_length": 2097.0001220703125, + "epoch": 0.19695121951219513, + "grad_norm": 0.10192073061862841, + "kl": 0.0908203125, + "learning_rate": 2.9149188050257847e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1292 + }, + { + "completion_length": 2440.166748046875, + "epoch": 0.19710365853658537, + "grad_norm": 0.06435942162949486, + "kl": 0.0733642578125, + "learning_rate": 2.9146536122481532e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1293 + }, + { + "completion_length": 3535.83349609375, + "epoch": 0.19725609756097562, + "grad_norm": 0.12495023593326005, + "kl": 0.0714111328125, + "learning_rate": 2.91438801892014e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1294 + }, + { + "completion_length": 2159.0001220703125, + "epoch": 0.19740853658536586, + "grad_norm": 0.076949283035587, + "kl": 0.070068359375, + "learning_rate": 2.914122025116945e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1295 + }, + { + "completion_length": 2444.666748046875, + "epoch": 0.1975609756097561, + "grad_norm": 0.1965693223190178, + "kl": 0.071533203125, + "learning_rate": 2.913855630913884e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1296 + }, + { + "completion_length": 1110.6667175292969, + "epoch": 0.19771341463414635, + "grad_norm": 0.10034529093706564, + "kl": 0.0810546875, + "learning_rate": 2.9135888363863843e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1297 + }, + { + "completion_length": 2826.8333740234375, + "epoch": 0.1978658536585366, + "grad_norm": 0.07776734123050685, + "kl": 0.0528564453125, + "learning_rate": 2.913321641609987e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1298 + }, + { + "completion_length": 2649.666748046875, + "epoch": 0.19801829268292684, + "grad_norm": 0.07769417843375115, + "kl": 0.061767578125, + "learning_rate": 2.913054046660347e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1299 + }, + { + "completion_length": 2821.33349609375, + "epoch": 0.19817073170731708, + "grad_norm": 0.05951931644972631, + "kl": 0.060791015625, + "learning_rate": 2.912786051613232e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1300 + }, + { + "completion_length": 2601.0001220703125, + "epoch": 0.19832317073170733, + "grad_norm": 0.05217343544449356, + "kl": 0.044677734375, + "learning_rate": 2.912517656544523e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1301 + }, + { + "completion_length": 2395.1666870117188, + "epoch": 0.19847560975609757, + "grad_norm": 0.06146593320779723, + "kl": 0.052734375, + "learning_rate": 2.912248861530214e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1302 + }, + { + "completion_length": 3108.0, + "epoch": 0.19862804878048781, + "grad_norm": 0.05462271244859334, + "kl": 0.0439453125, + "learning_rate": 2.911979666646414e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1303 + }, + { + "completion_length": 2245.0001220703125, + "epoch": 0.19878048780487806, + "grad_norm": 0.12547800401523476, + "kl": 0.057373046875, + "learning_rate": 2.911710071969342e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1304 + }, + { + "completion_length": 3113.3333740234375, + "epoch": 0.1989329268292683, + "grad_norm": 0.07002798852849262, + "kl": 0.052978515625, + "learning_rate": 2.911440077575334e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1305 + }, + { + "completion_length": 1849.166748046875, + "epoch": 0.19908536585365855, + "grad_norm": 3.2937895597990514, + "kl": 0.08544921875, + "learning_rate": 2.9111696835408356e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1306 + }, + { + "completion_length": 3698.666748046875, + "epoch": 0.1992378048780488, + "grad_norm": 0.04197951050652988, + "kl": 0.0435791015625, + "learning_rate": 2.9108988899424085e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1307 + }, + { + "completion_length": 2941.3333740234375, + "epoch": 0.19939024390243903, + "grad_norm": 0.1031100276190628, + "kl": 0.0576171875, + "learning_rate": 2.910627696856725e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1308 + }, + { + "completion_length": 1484.0, + "epoch": 0.19954268292682928, + "grad_norm": 0.11004858733930796, + "kl": 0.0902099609375, + "learning_rate": 2.9103561043605727e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1309 + }, + { + "completion_length": 2582.166748046875, + "epoch": 0.19969512195121952, + "grad_norm": 0.077975572595022, + "kl": 0.0689697265625, + "learning_rate": 2.9100841125308502e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1310 + }, + { + "completion_length": 1791.166748046875, + "epoch": 0.19984756097560977, + "grad_norm": 0.24378914564557752, + "kl": 0.0643310546875, + "learning_rate": 2.909811721444572e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1311 + }, + { + "completion_length": 2073.3333740234375, + "epoch": 0.2, + "grad_norm": 0.09520074395939339, + "kl": 0.046630859375, + "learning_rate": 2.9095389311788626e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1312 + }, + { + "completion_length": 3019.1666870117188, + "epoch": 0.20015243902439026, + "grad_norm": 0.08558248752476771, + "kl": 0.0577392578125, + "learning_rate": 2.909265741810961e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1313 + }, + { + "completion_length": 1807.8334350585938, + "epoch": 0.2003048780487805, + "grad_norm": 0.14070786341685473, + "kl": 0.0726318359375, + "learning_rate": 2.90899215341822e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1314 + }, + { + "completion_length": 1714.0000610351562, + "epoch": 0.20045731707317074, + "grad_norm": 0.08198919172066624, + "kl": 0.0712890625, + "learning_rate": 2.9087181660781035e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1315 + }, + { + "completion_length": 3436.666748046875, + "epoch": 0.200609756097561, + "grad_norm": 0.045558459072977696, + "kl": 0.047607421875, + "learning_rate": 2.9084437798681894e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1316 + }, + { + "completion_length": 2585.3333740234375, + "epoch": 0.20076219512195123, + "grad_norm": 0.05818031515929264, + "kl": 0.0552978515625, + "learning_rate": 2.9081689948661686e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1317 + }, + { + "completion_length": 1576.3333740234375, + "epoch": 0.20091463414634148, + "grad_norm": 0.09113791036084548, + "kl": 0.083984375, + "learning_rate": 2.907893811149845e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1318 + }, + { + "completion_length": 3517.8333740234375, + "epoch": 0.20106707317073172, + "grad_norm": 0.039803074386764295, + "kl": 0.039794921875, + "learning_rate": 2.907618228797135e-06, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1319 + }, + { + "completion_length": 3413.0, + "epoch": 0.20121951219512196, + "grad_norm": 0.04234719872569446, + "kl": 0.0372314453125, + "learning_rate": 2.9073422478860678e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1320 + }, + { + "completion_length": 3402.166748046875, + "epoch": 0.2013719512195122, + "grad_norm": 0.056129677227094325, + "kl": 0.04144287109375, + "learning_rate": 2.907065868494786e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1321 + }, + { + "completion_length": 2540.0, + "epoch": 0.20152439024390245, + "grad_norm": 0.08120904524740385, + "kl": 0.07421875, + "learning_rate": 2.906789090701545e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1322 + }, + { + "completion_length": 2151.0000610351562, + "epoch": 0.2016768292682927, + "grad_norm": 0.14514773291280342, + "kl": 0.07177734375, + "learning_rate": 2.9065119145847118e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1323 + }, + { + "completion_length": 2277.166748046875, + "epoch": 0.20182926829268294, + "grad_norm": 0.11415971222650499, + "kl": 0.070556640625, + "learning_rate": 2.906234340222768e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1324 + }, + { + "completion_length": 1720.8333740234375, + "epoch": 0.20198170731707318, + "grad_norm": 0.0745137484082003, + "kl": 0.062255859375, + "learning_rate": 2.905956367694306e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1325 + }, + { + "completion_length": 3268.0, + "epoch": 0.20213414634146343, + "grad_norm": 0.1249660584549861, + "kl": 0.0460205078125, + "learning_rate": 2.9056779970780334e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1326 + }, + { + "completion_length": 2029.3333740234375, + "epoch": 0.20228658536585367, + "grad_norm": 0.07490028926286499, + "kl": 0.05078125, + "learning_rate": 2.9053992284527682e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1327 + }, + { + "completion_length": 2789.666748046875, + "epoch": 0.20243902439024392, + "grad_norm": 0.05094880632798978, + "kl": 0.0477294921875, + "learning_rate": 2.9051200618974418e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1328 + }, + { + "completion_length": 1841.3333740234375, + "epoch": 0.20259146341463416, + "grad_norm": 0.06515534518593813, + "kl": 0.0413818359375, + "learning_rate": 2.904840497491099e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1329 + }, + { + "completion_length": 2445.166748046875, + "epoch": 0.2027439024390244, + "grad_norm": 0.08834350537247994, + "kl": 0.060546875, + "learning_rate": 2.904560535312897e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1330 + }, + { + "completion_length": 1375.8333435058594, + "epoch": 0.20289634146341465, + "grad_norm": 0.11522566388502041, + "kl": 0.07763671875, + "learning_rate": 2.9042801754421043e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1331 + }, + { + "completion_length": 2323.0, + "epoch": 0.2030487804878049, + "grad_norm": 0.18877139496646922, + "kl": 0.0609130859375, + "learning_rate": 2.903999417958104e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1332 + }, + { + "completion_length": 840.6666717529297, + "epoch": 0.2032012195121951, + "grad_norm": 0.6458905047111411, + "kl": 0.0552978515625, + "learning_rate": 2.9037182629403906e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1333 + }, + { + "completion_length": 2611.5001220703125, + "epoch": 0.20335365853658535, + "grad_norm": 0.04334055987264273, + "kl": 0.0516357421875, + "learning_rate": 2.903436710468571e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1334 + }, + { + "completion_length": 1360.6666870117188, + "epoch": 0.2035060975609756, + "grad_norm": 0.10308427316959858, + "kl": 0.0701904296875, + "learning_rate": 2.9031547606223657e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1335 + }, + { + "completion_length": 1283.0000610351562, + "epoch": 0.20365853658536584, + "grad_norm": 0.11094609627760041, + "kl": 0.082763671875, + "learning_rate": 2.9028724134816064e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1336 + }, + { + "completion_length": 892.8333435058594, + "epoch": 0.20381097560975608, + "grad_norm": 0.10362878323293989, + "kl": 0.08447265625, + "learning_rate": 2.9025896691262385e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1337 + }, + { + "completion_length": 2667.166748046875, + "epoch": 0.20396341463414633, + "grad_norm": 0.053237978840103097, + "kl": 0.0491943359375, + "learning_rate": 2.902306527636319e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1338 + }, + { + "completion_length": 2076.666748046875, + "epoch": 0.20411585365853657, + "grad_norm": 0.10484836544949115, + "kl": 0.083740234375, + "learning_rate": 2.9020229890920176e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1339 + }, + { + "completion_length": 1603.3334350585938, + "epoch": 0.20426829268292682, + "grad_norm": 0.13071366410487903, + "kl": 0.0771484375, + "learning_rate": 2.9017390535736164e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1340 + }, + { + "completion_length": 1618.3334350585938, + "epoch": 0.20442073170731706, + "grad_norm": 0.08913710658452254, + "kl": 0.051513671875, + "learning_rate": 2.90145472116151e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1341 + }, + { + "completion_length": 1661.1667175292969, + "epoch": 0.2045731707317073, + "grad_norm": 0.09567597139578354, + "kl": 0.059814453125, + "learning_rate": 2.9011699919362064e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1342 + }, + { + "completion_length": 1443.5, + "epoch": 0.20472560975609755, + "grad_norm": 0.11696832429448804, + "kl": 0.078369140625, + "learning_rate": 2.9008848659783236e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1343 + }, + { + "completion_length": 1584.6666870117188, + "epoch": 0.2048780487804878, + "grad_norm": 1.7614181074615258, + "kl": 0.0594482421875, + "learning_rate": 2.9005993433685932e-06, + "loss": 0.0024, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1344 + }, + { + "completion_length": 2885.166748046875, + "epoch": 0.20503048780487804, + "grad_norm": 0.11741872461218095, + "kl": 0.0443115234375, + "learning_rate": 2.90031342418786e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1345 + }, + { + "completion_length": 1524.3333740234375, + "epoch": 0.20518292682926828, + "grad_norm": 0.1829445865489553, + "kl": 0.111328125, + "learning_rate": 2.90002710851708e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1346 + }, + { + "completion_length": 2771.0000610351562, + "epoch": 0.20533536585365852, + "grad_norm": 0.11524969510967063, + "kl": 0.051513671875, + "learning_rate": 2.899740396437321e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1347 + }, + { + "completion_length": 2762.8333740234375, + "epoch": 0.20548780487804877, + "grad_norm": 0.0681581704710903, + "kl": 0.0545654296875, + "learning_rate": 2.899453288029765e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1348 + }, + { + "completion_length": 2944.8333740234375, + "epoch": 0.205640243902439, + "grad_norm": 0.0864995159276782, + "kl": 0.05810546875, + "learning_rate": 2.8991657833757038e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1349 + }, + { + "completion_length": 2236.0000610351562, + "epoch": 0.20579268292682926, + "grad_norm": 0.11086827688790901, + "kl": 0.074462890625, + "learning_rate": 2.8988778825565433e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1350 + }, + { + "completion_length": 2615.0001220703125, + "epoch": 0.2059451219512195, + "grad_norm": 0.12767375320889557, + "kl": 0.0599365234375, + "learning_rate": 2.8985895856538005e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1351 + }, + { + "completion_length": 1956.8334350585938, + "epoch": 0.20609756097560974, + "grad_norm": 0.17696430670071978, + "kl": 0.0677490234375, + "learning_rate": 2.8983008927491046e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1352 + }, + { + "completion_length": 3317.166748046875, + "epoch": 0.20625, + "grad_norm": 0.061986676797229884, + "kl": 0.0546875, + "learning_rate": 2.8980118039241983e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1353 + }, + { + "completion_length": 1715.5, + "epoch": 0.20640243902439023, + "grad_norm": 0.10426089438624886, + "kl": 0.06884765625, + "learning_rate": 2.8977223192609336e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1354 + }, + { + "completion_length": 2247.6666870117188, + "epoch": 0.20655487804878048, + "grad_norm": 0.08264951375533967, + "kl": 0.0518798828125, + "learning_rate": 2.8974324388412775e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1355 + }, + { + "completion_length": 2599.1666870117188, + "epoch": 0.20670731707317072, + "grad_norm": 0.16397990364581133, + "kl": 0.0550537109375, + "learning_rate": 2.8971421627473075e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1356 + }, + { + "completion_length": 3024.0, + "epoch": 0.20685975609756097, + "grad_norm": 0.057837530719854605, + "kl": 0.0455322265625, + "learning_rate": 2.896851491061214e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1357 + }, + { + "completion_length": 2940.0001220703125, + "epoch": 0.2070121951219512, + "grad_norm": 0.09751807458328843, + "kl": 0.055908203125, + "learning_rate": 2.896560423865298e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1358 + }, + { + "completion_length": 3331.33349609375, + "epoch": 0.20716463414634145, + "grad_norm": 0.06855398643113415, + "kl": 0.0484619140625, + "learning_rate": 2.896268961241974e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1359 + }, + { + "completion_length": 2631.666748046875, + "epoch": 0.2073170731707317, + "grad_norm": 0.10781971161995804, + "kl": 0.0616455078125, + "learning_rate": 2.8959771032737673e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1360 + }, + { + "completion_length": 3490.8333740234375, + "epoch": 0.20746951219512194, + "grad_norm": 0.06401023813310266, + "kl": 0.049560546875, + "learning_rate": 2.8956848500433164e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1361 + }, + { + "completion_length": 2166.8333740234375, + "epoch": 0.20762195121951219, + "grad_norm": 0.1140831758295194, + "kl": 0.058837890625, + "learning_rate": 2.8953922016333704e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1362 + }, + { + "completion_length": 1588.1666870117188, + "epoch": 0.20777439024390243, + "grad_norm": 0.06939669811503894, + "kl": 0.057861328125, + "learning_rate": 2.8950991581267912e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1363 + }, + { + "completion_length": 2846.5, + "epoch": 0.20792682926829267, + "grad_norm": 0.05631726245579542, + "kl": 0.0509033203125, + "learning_rate": 2.8948057196065517e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1364 + }, + { + "completion_length": 3092.8333740234375, + "epoch": 0.20807926829268292, + "grad_norm": 0.049961682650243, + "kl": 0.0458984375, + "learning_rate": 2.894511886155738e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1365 + }, + { + "completion_length": 2744.8333740234375, + "epoch": 0.20823170731707316, + "grad_norm": 0.07382042988536043, + "kl": 0.0621337890625, + "learning_rate": 2.8942176578575465e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1366 + }, + { + "completion_length": 2323.5001220703125, + "epoch": 0.2083841463414634, + "grad_norm": 0.12415409838098235, + "kl": 0.0625, + "learning_rate": 2.8939230347952867e-06, + "loss": 0.0025, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1367 + }, + { + "completion_length": 3155.166748046875, + "epoch": 0.20853658536585365, + "grad_norm": 0.06112003959519557, + "kl": 0.0516357421875, + "learning_rate": 2.8936280170523784e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1368 + }, + { + "completion_length": 1861.8333740234375, + "epoch": 0.2086890243902439, + "grad_norm": 0.08715395939465305, + "kl": 0.06494140625, + "learning_rate": 2.8933326047123556e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1369 + }, + { + "completion_length": 3366.8333740234375, + "epoch": 0.20884146341463414, + "grad_norm": 0.09928189867259862, + "kl": 0.0399169921875, + "learning_rate": 2.893036797858861e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1370 + }, + { + "completion_length": 3166.8333740234375, + "epoch": 0.20899390243902438, + "grad_norm": 0.043874198962747656, + "kl": 0.0498046875, + "learning_rate": 2.892740596575651e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1371 + }, + { + "completion_length": 2504.0000610351562, + "epoch": 0.20914634146341463, + "grad_norm": 0.05882735019570067, + "kl": 0.040283203125, + "learning_rate": 2.892444000946593e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1372 + }, + { + "completion_length": 2499.83349609375, + "epoch": 0.20929878048780487, + "grad_norm": 0.11035127822292413, + "kl": 0.0640869140625, + "learning_rate": 2.8921470110556668e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1373 + }, + { + "completion_length": 2561.666748046875, + "epoch": 0.2094512195121951, + "grad_norm": 0.06586585027562132, + "kl": 0.0531005859375, + "learning_rate": 2.8918496269869626e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1374 + }, + { + "completion_length": 2654.0, + "epoch": 0.20960365853658536, + "grad_norm": 0.06071539435518065, + "kl": 0.0599365234375, + "learning_rate": 2.891551848824683e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1375 + }, + { + "completion_length": 2931.666748046875, + "epoch": 0.2097560975609756, + "grad_norm": 0.07494202542016966, + "kl": 0.0592041015625, + "learning_rate": 2.8912536766531423e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1376 + }, + { + "completion_length": 3253.3333740234375, + "epoch": 0.20990853658536585, + "grad_norm": 0.04608111248664155, + "kl": 0.0469970703125, + "learning_rate": 2.8909551105567657e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1377 + }, + { + "completion_length": 2187.8333740234375, + "epoch": 0.2100609756097561, + "grad_norm": 487.8968808959556, + "kl": 12.706298828125, + "learning_rate": 2.8906561506200905e-06, + "loss": 0.5099, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1378 + }, + { + "completion_length": 2688.0, + "epoch": 0.21021341463414633, + "grad_norm": 0.04771651151927936, + "kl": 0.056396484375, + "learning_rate": 2.8903567969277652e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1379 + }, + { + "completion_length": 2120.166748046875, + "epoch": 0.21036585365853658, + "grad_norm": 0.14647076783729354, + "kl": 0.052490234375, + "learning_rate": 2.8900570495645504e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1380 + }, + { + "completion_length": 2473.166748046875, + "epoch": 0.21051829268292682, + "grad_norm": 0.07071656794816251, + "kl": 0.05126953125, + "learning_rate": 2.889756908615317e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1381 + }, + { + "completion_length": 2214.666748046875, + "epoch": 0.21067073170731707, + "grad_norm": 0.09169273208024373, + "kl": 0.04638671875, + "learning_rate": 2.889456374165049e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1382 + }, + { + "completion_length": 1874.0000610351562, + "epoch": 0.2108231707317073, + "grad_norm": 5.049598062194576, + "kl": 0.117919921875, + "learning_rate": 2.8891554462988398e-06, + "loss": 0.0047, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1383 + }, + { + "completion_length": 1689.1667175292969, + "epoch": 0.21097560975609755, + "grad_norm": 0.1010372070952937, + "kl": 0.046875, + "learning_rate": 2.8888541251018963e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1384 + }, + { + "completion_length": 2539.0000610351562, + "epoch": 0.2111280487804878, + "grad_norm": 0.3722860254272298, + "kl": 0.0728759765625, + "learning_rate": 2.8885524106595356e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1385 + }, + { + "completion_length": 2029.666748046875, + "epoch": 0.21128048780487804, + "grad_norm": 0.06634368722794133, + "kl": 0.041748046875, + "learning_rate": 2.8882503030571847e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1386 + }, + { + "completion_length": 1681.8333740234375, + "epoch": 0.2114329268292683, + "grad_norm": 0.12039791952721, + "kl": 0.067138671875, + "learning_rate": 2.887947802380385e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1387 + }, + { + "completion_length": 2715.3333740234375, + "epoch": 0.21158536585365853, + "grad_norm": 0.05865690993315692, + "kl": 0.047119140625, + "learning_rate": 2.887644908714788e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1388 + }, + { + "completion_length": 3449.166748046875, + "epoch": 0.21173780487804877, + "grad_norm": 0.049920821562390194, + "kl": 0.0518798828125, + "learning_rate": 2.887341622146155e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1389 + }, + { + "completion_length": 1999.3333740234375, + "epoch": 0.21189024390243902, + "grad_norm": 0.11964438501740657, + "kl": 0.057861328125, + "learning_rate": 2.88703794276036e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1390 + }, + { + "completion_length": 2375.6666870117188, + "epoch": 0.21204268292682926, + "grad_norm": 0.061971837365097336, + "kl": 0.04638671875, + "learning_rate": 2.8867338706433885e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1391 + }, + { + "completion_length": 2018.0, + "epoch": 0.2121951219512195, + "grad_norm": 0.07592263148846702, + "kl": 0.055908203125, + "learning_rate": 2.8864294058813364e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1392 + }, + { + "completion_length": 3629.166748046875, + "epoch": 0.21234756097560975, + "grad_norm": 0.03493412383590853, + "kl": 0.03460693359375, + "learning_rate": 2.886124548560411e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1393 + }, + { + "completion_length": 3547.166748046875, + "epoch": 0.2125, + "grad_norm": 0.05939278976526522, + "kl": 0.0579833984375, + "learning_rate": 2.88581929876693e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1394 + }, + { + "completion_length": 2080.5, + "epoch": 0.21265243902439024, + "grad_norm": 0.10666340902616331, + "kl": 0.0513916015625, + "learning_rate": 2.8855136565873243e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1395 + }, + { + "completion_length": 2130.0, + "epoch": 0.21280487804878048, + "grad_norm": 0.10203305522918728, + "kl": 0.05224609375, + "learning_rate": 2.8852076221081333e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1396 + }, + { + "completion_length": 2136.166748046875, + "epoch": 0.21295731707317073, + "grad_norm": 1.3758937958017807, + "kl": 0.07373046875, + "learning_rate": 2.88490119541601e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1397 + }, + { + "completion_length": 1395.5000610351562, + "epoch": 0.21310975609756097, + "grad_norm": 0.11040385427266945, + "kl": 0.071044921875, + "learning_rate": 2.8845943765977162e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1398 + }, + { + "completion_length": 2574.3333740234375, + "epoch": 0.21326219512195121, + "grad_norm": 0.08201731512553738, + "kl": 0.050048828125, + "learning_rate": 2.8842871657401264e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1399 + }, + { + "completion_length": 3225.8333740234375, + "epoch": 0.21341463414634146, + "grad_norm": 0.08557778497418296, + "kl": 0.070068359375, + "learning_rate": 2.883979562930225e-06, + "loss": 0.0028, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1400 + }, + { + "completion_length": 3286.8333740234375, + "epoch": 0.2135670731707317, + "grad_norm": 0.07773095673065647, + "kl": 0.073486328125, + "learning_rate": 2.883671568255108e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1401 + }, + { + "completion_length": 2021.0, + "epoch": 0.21371951219512195, + "grad_norm": 0.12699773619381063, + "kl": 0.072265625, + "learning_rate": 2.8833631818019818e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1402 + }, + { + "completion_length": 3296.83349609375, + "epoch": 0.2138719512195122, + "grad_norm": 0.1045397749139122, + "kl": 0.05810546875, + "learning_rate": 2.883054403658165e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1403 + }, + { + "completion_length": 2907.83349609375, + "epoch": 0.21402439024390243, + "grad_norm": 0.12718250846823928, + "kl": 0.0626220703125, + "learning_rate": 2.8827452339110856e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1404 + }, + { + "completion_length": 3397.8333740234375, + "epoch": 0.21417682926829268, + "grad_norm": 0.07162701038214833, + "kl": 0.07568359375, + "learning_rate": 2.882435672648283e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1405 + }, + { + "completion_length": 2792.3333740234375, + "epoch": 0.21432926829268292, + "grad_norm": 0.0768355290538249, + "kl": 0.0738525390625, + "learning_rate": 2.8821257199574082e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1406 + }, + { + "completion_length": 2841.3333740234375, + "epoch": 0.21448170731707317, + "grad_norm": 0.08946297030254306, + "kl": 0.076904296875, + "learning_rate": 2.8818153759262213e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1407 + }, + { + "completion_length": 3277.8333740234375, + "epoch": 0.2146341463414634, + "grad_norm": 0.059395347909924325, + "kl": 0.051025390625, + "learning_rate": 2.8815046406425954e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1408 + }, + { + "completion_length": 2900.5001220703125, + "epoch": 0.21478658536585366, + "grad_norm": 0.06048829329717814, + "kl": 0.054443359375, + "learning_rate": 2.8811935141945127e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1409 + }, + { + "completion_length": 2216.166748046875, + "epoch": 0.2149390243902439, + "grad_norm": 0.1722824030269154, + "kl": 0.08935546875, + "learning_rate": 2.8808819966700667e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1410 + }, + { + "completion_length": 1389.0000610351562, + "epoch": 0.21509146341463414, + "grad_norm": 0.10698176859194916, + "kl": 0.07373046875, + "learning_rate": 2.8805700881574616e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1411 + }, + { + "completion_length": 2957.6666870117188, + "epoch": 0.2152439024390244, + "grad_norm": 0.11053628867044893, + "kl": 0.04595947265625, + "learning_rate": 2.8802577887450124e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1412 + }, + { + "completion_length": 2033.5000610351562, + "epoch": 0.21539634146341463, + "grad_norm": 0.08763627911852491, + "kl": 0.0625, + "learning_rate": 2.8799450985211456e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1413 + }, + { + "completion_length": 2223.166748046875, + "epoch": 0.21554878048780488, + "grad_norm": 0.09226207626163647, + "kl": 0.088623046875, + "learning_rate": 2.8796320175743963e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1414 + }, + { + "completion_length": 2208.666748046875, + "epoch": 0.21570121951219512, + "grad_norm": 0.07691822553740503, + "kl": 0.05255126953125, + "learning_rate": 2.8793185459934116e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1415 + }, + { + "completion_length": 2529.5001220703125, + "epoch": 0.21585365853658536, + "grad_norm": 0.1261852122660739, + "kl": 0.0565185546875, + "learning_rate": 2.8790046838669493e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1416 + }, + { + "completion_length": 2473.5000610351562, + "epoch": 0.2160060975609756, + "grad_norm": 0.06617112239842877, + "kl": 0.064208984375, + "learning_rate": 2.8786904312838778e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1417 + }, + { + "completion_length": 2266.166748046875, + "epoch": 0.21615853658536585, + "grad_norm": 0.1725944874980742, + "kl": 0.051513671875, + "learning_rate": 2.8783757883331754e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1418 + }, + { + "completion_length": 2038.166748046875, + "epoch": 0.2163109756097561, + "grad_norm": 0.08221943944220256, + "kl": 0.0693359375, + "learning_rate": 2.8780607551039314e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1419 + }, + { + "completion_length": 2299.5000610351562, + "epoch": 0.21646341463414634, + "grad_norm": 0.8623364337867355, + "kl": 0.05419921875, + "learning_rate": 2.877745331685345e-06, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1420 + }, + { + "completion_length": 1758.166748046875, + "epoch": 0.21661585365853658, + "grad_norm": 0.10511803053715753, + "kl": 0.0640869140625, + "learning_rate": 2.8774295181667273e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1421 + }, + { + "completion_length": 2125.6666870117188, + "epoch": 0.21676829268292683, + "grad_norm": 0.09817036496159268, + "kl": 0.0556640625, + "learning_rate": 2.8771133146374983e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1422 + }, + { + "completion_length": 2031.0, + "epoch": 0.21692073170731707, + "grad_norm": 0.10045291412765192, + "kl": 0.060546875, + "learning_rate": 2.8767967211871896e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1423 + }, + { + "completion_length": 2254.166748046875, + "epoch": 0.21707317073170732, + "grad_norm": 0.09477347142599858, + "kl": 0.070068359375, + "learning_rate": 2.876479737905442e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1424 + }, + { + "completion_length": 1510.3333740234375, + "epoch": 0.21722560975609756, + "grad_norm": 0.17806621299792338, + "kl": 0.0716552734375, + "learning_rate": 2.8761623648820077e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1425 + }, + { + "completion_length": 1826.6666870117188, + "epoch": 0.2173780487804878, + "grad_norm": 0.08533629066817464, + "kl": 0.05615234375, + "learning_rate": 2.875844602206749e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1426 + }, + { + "completion_length": 1730.166748046875, + "epoch": 0.21753048780487805, + "grad_norm": 1.5674979498524628, + "kl": 0.07177734375, + "learning_rate": 2.8755264499696384e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1427 + }, + { + "completion_length": 1330.8333740234375, + "epoch": 0.2176829268292683, + "grad_norm": 0.11862655480717958, + "kl": 0.072021484375, + "learning_rate": 2.875207908260758e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1428 + }, + { + "completion_length": 1396.8333740234375, + "epoch": 0.21783536585365854, + "grad_norm": 0.1114300354240298, + "kl": 0.0810546875, + "learning_rate": 2.8748889771703023e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1429 + }, + { + "completion_length": 1749.666748046875, + "epoch": 0.21798780487804878, + "grad_norm": 0.08124681841914991, + "kl": 0.05712890625, + "learning_rate": 2.8745696567885733e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1430 + }, + { + "completion_length": 1837.166748046875, + "epoch": 0.21814024390243902, + "grad_norm": 0.121245884407972, + "kl": 0.074951171875, + "learning_rate": 2.8742499472059857e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1431 + }, + { + "completion_length": 2926.0, + "epoch": 0.21829268292682927, + "grad_norm": 0.10656553371807662, + "kl": 0.064697265625, + "learning_rate": 2.8739298485130627e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1432 + }, + { + "completion_length": 1648.5001220703125, + "epoch": 0.2184451219512195, + "grad_norm": 0.1196963933652611, + "kl": 0.0654296875, + "learning_rate": 2.8736093608004376e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1433 + }, + { + "completion_length": 2530.0000610351562, + "epoch": 0.21859756097560976, + "grad_norm": 0.08823166674683243, + "kl": 0.07080078125, + "learning_rate": 2.8732884841588558e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1434 + }, + { + "completion_length": 1568.3333740234375, + "epoch": 0.21875, + "grad_norm": 0.14063298661315524, + "kl": 0.0870361328125, + "learning_rate": 2.872967218679171e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1435 + }, + { + "completion_length": 1743.0, + "epoch": 0.21890243902439024, + "grad_norm": 0.1342157264414649, + "kl": 0.06298828125, + "learning_rate": 2.8726455644523473e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1436 + }, + { + "completion_length": 923.8333740234375, + "epoch": 0.2190548780487805, + "grad_norm": 0.18491492675011625, + "kl": 0.087646484375, + "learning_rate": 2.872323521569459e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1437 + }, + { + "completion_length": 1037.0000305175781, + "epoch": 0.21920731707317073, + "grad_norm": 0.10199872366591484, + "kl": 0.074951171875, + "learning_rate": 2.8720010901216912e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1438 + }, + { + "completion_length": 2055.3334350585938, + "epoch": 0.21935975609756098, + "grad_norm": 0.06843107070610913, + "kl": 0.06201171875, + "learning_rate": 2.871678270200338e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1439 + }, + { + "completion_length": 2328.5000610351562, + "epoch": 0.21951219512195122, + "grad_norm": 0.12034851451900001, + "kl": 0.07861328125, + "learning_rate": 2.8713550618968034e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1440 + }, + { + "completion_length": 1494.1666717529297, + "epoch": 0.21966463414634146, + "grad_norm": 0.12905081129181156, + "kl": 0.082275390625, + "learning_rate": 2.8710314653026023e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1441 + }, + { + "completion_length": 2514.3333740234375, + "epoch": 0.2198170731707317, + "grad_norm": 0.1101266278167607, + "kl": 0.0625, + "learning_rate": 2.8707074805093594e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1442 + }, + { + "completion_length": 1939.666748046875, + "epoch": 0.21996951219512195, + "grad_norm": 0.10811915202052141, + "kl": 0.074462890625, + "learning_rate": 2.8703831076088082e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1443 + }, + { + "completion_length": 1521.0, + "epoch": 0.2201219512195122, + "grad_norm": 0.20613719360746433, + "kl": 0.06884765625, + "learning_rate": 2.8700583466927935e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1444 + }, + { + "completion_length": 4096.0, + "epoch": 0.22027439024390244, + "grad_norm": 0.06300593957906397, + "kl": 0.034912109375, + "learning_rate": 2.8697331978532687e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1445 + }, + { + "completion_length": 3565.666748046875, + "epoch": 0.22042682926829268, + "grad_norm": 0.04959718696716119, + "kl": 0.0435791015625, + "learning_rate": 2.8694076611822986e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1446 + }, + { + "completion_length": 3106.666748046875, + "epoch": 0.22057926829268293, + "grad_norm": 0.07891195385589338, + "kl": 0.044677734375, + "learning_rate": 2.869081736772056e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1447 + }, + { + "completion_length": 2856.1666870117188, + "epoch": 0.22073170731707317, + "grad_norm": 0.08134926465021143, + "kl": 0.03839111328125, + "learning_rate": 2.8687554247148247e-06, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1448 + }, + { + "completion_length": 2928.3333740234375, + "epoch": 0.22088414634146342, + "grad_norm": 0.15117510749786045, + "kl": 0.0562744140625, + "learning_rate": 2.8684287251029986e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1449 + }, + { + "completion_length": 2380.1666870117188, + "epoch": 0.22103658536585366, + "grad_norm": 0.08025194238255239, + "kl": 0.0615234375, + "learning_rate": 2.86810163802908e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1450 + }, + { + "completion_length": 1628.0, + "epoch": 0.2211890243902439, + "grad_norm": 0.1332663675744272, + "kl": 0.0491943359375, + "learning_rate": 2.867774163585681e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1451 + }, + { + "completion_length": 2952.666748046875, + "epoch": 0.22134146341463415, + "grad_norm": 0.0932555861341571, + "kl": 0.0546875, + "learning_rate": 2.8674463018655245e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1452 + }, + { + "completion_length": 2207.3334350585938, + "epoch": 0.2214939024390244, + "grad_norm": 0.06301763797618479, + "kl": 0.0435791015625, + "learning_rate": 2.8671180529614424e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1453 + }, + { + "completion_length": 2823.3333740234375, + "epoch": 0.22164634146341464, + "grad_norm": 0.06873827346697162, + "kl": 0.04766845703125, + "learning_rate": 2.8667894169663773e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1454 + }, + { + "completion_length": 1635.0, + "epoch": 0.22179878048780488, + "grad_norm": 0.139820413699075, + "kl": 0.0767822265625, + "learning_rate": 2.866460393973379e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1455 + }, + { + "completion_length": 2345.8333740234375, + "epoch": 0.22195121951219512, + "grad_norm": 0.07128477282861424, + "kl": 0.06201171875, + "learning_rate": 2.8661309840756093e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1456 + }, + { + "completion_length": 913.3333587646484, + "epoch": 0.22210365853658537, + "grad_norm": 0.13353103330692884, + "kl": 0.05615234375, + "learning_rate": 2.8658011873663383e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1457 + }, + { + "completion_length": 2162.166748046875, + "epoch": 0.2222560975609756, + "grad_norm": 0.07820844802143663, + "kl": 0.0589599609375, + "learning_rate": 2.8654710039389452e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1458 + }, + { + "completion_length": 1934.8333740234375, + "epoch": 0.22240853658536586, + "grad_norm": 0.068238851286638, + "kl": 0.0565185546875, + "learning_rate": 2.8651404338869205e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1459 + }, + { + "completion_length": 2798.5001220703125, + "epoch": 0.2225609756097561, + "grad_norm": 0.09884303404744892, + "kl": 0.0491943359375, + "learning_rate": 2.8648094773038625e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1460 + }, + { + "completion_length": 1488.1666870117188, + "epoch": 0.22271341463414634, + "grad_norm": 0.32254182496805095, + "kl": 0.068115234375, + "learning_rate": 2.8644781342834794e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1461 + }, + { + "completion_length": 2194.3333740234375, + "epoch": 0.2228658536585366, + "grad_norm": 0.08715119305770118, + "kl": 0.0535888671875, + "learning_rate": 2.8641464049195894e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1462 + }, + { + "completion_length": 2293.8333740234375, + "epoch": 0.22301829268292683, + "grad_norm": 0.09340129441921396, + "kl": 0.061767578125, + "learning_rate": 2.863814289306119e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1463 + }, + { + "completion_length": 2567.5001220703125, + "epoch": 0.22317073170731708, + "grad_norm": 0.06616369534885287, + "kl": 0.0423583984375, + "learning_rate": 2.863481787537105e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1464 + }, + { + "completion_length": 1319.3333740234375, + "epoch": 0.22332317073170732, + "grad_norm": 0.1842470314062957, + "kl": 0.0732421875, + "learning_rate": 2.8631488997066933e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1465 + }, + { + "completion_length": 1886.3333740234375, + "epoch": 0.22347560975609757, + "grad_norm": 0.0802972524297949, + "kl": 0.0504150390625, + "learning_rate": 2.862815625909139e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1466 + }, + { + "completion_length": 2101.5, + "epoch": 0.2236280487804878, + "grad_norm": 0.13112067552615572, + "kl": 0.055419921875, + "learning_rate": 2.8624819662388063e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1467 + }, + { + "completion_length": 2279.0001220703125, + "epoch": 0.22378048780487805, + "grad_norm": 0.052168180356325124, + "kl": 0.04052734375, + "learning_rate": 2.8621479207901685e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1468 + }, + { + "completion_length": 2039.3334350585938, + "epoch": 0.2239329268292683, + "grad_norm": 0.08108104903999858, + "kl": 0.0552978515625, + "learning_rate": 2.8618134896578096e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1469 + }, + { + "completion_length": 1994.3334350585938, + "epoch": 0.22408536585365854, + "grad_norm": 0.14747449675071214, + "kl": 0.0635986328125, + "learning_rate": 2.8614786729364205e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1470 + }, + { + "completion_length": 2282.666748046875, + "epoch": 0.22423780487804879, + "grad_norm": 0.10554774829904295, + "kl": 0.0523681640625, + "learning_rate": 2.861143470720803e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1471 + }, + { + "completion_length": 2099.8333740234375, + "epoch": 0.22439024390243903, + "grad_norm": 0.10517736252347672, + "kl": 0.052001953125, + "learning_rate": 2.8608078831058682e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1472 + }, + { + "completion_length": 3038.0, + "epoch": 0.22454268292682927, + "grad_norm": 0.04331417662169798, + "kl": 0.0338134765625, + "learning_rate": 2.8604719101866343e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1473 + }, + { + "completion_length": 2689.8333740234375, + "epoch": 0.22469512195121952, + "grad_norm": 0.09638972770118187, + "kl": 0.0518798828125, + "learning_rate": 2.860135552058231e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1474 + }, + { + "completion_length": 1117.8333435058594, + "epoch": 0.22484756097560976, + "grad_norm": 0.09199338091080361, + "kl": 0.0592041015625, + "learning_rate": 2.8597988088158956e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1475 + }, + { + "completion_length": 4096.0, + "epoch": 0.225, + "grad_norm": 0.03650598075328907, + "kl": 0.03289794921875, + "learning_rate": 2.859461680554975e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1476 + }, + { + "completion_length": 2369.3334350585938, + "epoch": 0.22515243902439025, + "grad_norm": 0.06909756089396062, + "kl": 0.042724609375, + "learning_rate": 2.8591241673709246e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1477 + }, + { + "completion_length": 1701.1666870117188, + "epoch": 0.2253048780487805, + "grad_norm": 0.13661984785348144, + "kl": 0.0518798828125, + "learning_rate": 2.85878626935931e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1478 + }, + { + "completion_length": 2918.6666870117188, + "epoch": 0.22545731707317074, + "grad_norm": 0.0682436340699923, + "kl": 0.04974365234375, + "learning_rate": 2.8584479866158038e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1479 + }, + { + "completion_length": 2318.83349609375, + "epoch": 0.22560975609756098, + "grad_norm": 0.09743126723833952, + "kl": 0.0633544921875, + "learning_rate": 2.8581093192361895e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1480 + }, + { + "completion_length": 2098.666748046875, + "epoch": 0.22576219512195123, + "grad_norm": 0.08037495845885574, + "kl": 0.0595703125, + "learning_rate": 2.857770267316358e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1481 + }, + { + "completion_length": 1923.8333740234375, + "epoch": 0.22591463414634147, + "grad_norm": 0.0854652870609238, + "kl": 0.0577392578125, + "learning_rate": 2.857430830952311e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1482 + }, + { + "completion_length": 2641.5001220703125, + "epoch": 0.2260670731707317, + "grad_norm": 0.062419788990957376, + "kl": 0.0474853515625, + "learning_rate": 2.857091010240156e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1483 + }, + { + "completion_length": 2348.166748046875, + "epoch": 0.22621951219512196, + "grad_norm": 0.10880136409620632, + "kl": 0.0562744140625, + "learning_rate": 2.8567508052761125e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1484 + }, + { + "completion_length": 2912.166748046875, + "epoch": 0.2263719512195122, + "grad_norm": 0.08590953245464919, + "kl": 0.0457763671875, + "learning_rate": 2.856410216156507e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1485 + }, + { + "completion_length": 2563.666748046875, + "epoch": 0.22652439024390245, + "grad_norm": 0.06868568919516366, + "kl": 0.0474853515625, + "learning_rate": 2.856069242977775e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1486 + }, + { + "completion_length": 2576.3333740234375, + "epoch": 0.2266768292682927, + "grad_norm": 0.08111649158198897, + "kl": 0.056396484375, + "learning_rate": 2.8557278858364614e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1487 + }, + { + "completion_length": 1968.1666870117188, + "epoch": 0.22682926829268293, + "grad_norm": 0.10138960056558456, + "kl": 0.053466796875, + "learning_rate": 2.8553861448292185e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1488 + }, + { + "completion_length": 1483.8333435058594, + "epoch": 0.22698170731707318, + "grad_norm": 0.1393172860989587, + "kl": 0.077880859375, + "learning_rate": 2.8550440200528093e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1489 + }, + { + "completion_length": 2979.666748046875, + "epoch": 0.22713414634146342, + "grad_norm": 0.05508305647370043, + "kl": 0.0419921875, + "learning_rate": 2.8547015116041035e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1490 + }, + { + "completion_length": 3070.3333740234375, + "epoch": 0.22728658536585367, + "grad_norm": 0.052814703423680213, + "kl": 0.04150390625, + "learning_rate": 2.8543586195800804e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1491 + }, + { + "completion_length": 1529.8333740234375, + "epoch": 0.2274390243902439, + "grad_norm": 0.1355416020318288, + "kl": 0.0546875, + "learning_rate": 2.854015344077828e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1492 + }, + { + "completion_length": 2672.0, + "epoch": 0.22759146341463415, + "grad_norm": 0.08646494063283183, + "kl": 0.052734375, + "learning_rate": 2.8536716851945423e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1493 + }, + { + "completion_length": 1882.0000610351562, + "epoch": 0.2277439024390244, + "grad_norm": 0.09460303005990185, + "kl": 0.048828125, + "learning_rate": 2.853327643027528e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1494 + }, + { + "completion_length": 2395.3333740234375, + "epoch": 0.22789634146341464, + "grad_norm": 0.059167751527725805, + "kl": 0.0386962890625, + "learning_rate": 2.8529832176741993e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1495 + }, + { + "completion_length": 2308.5001220703125, + "epoch": 0.2280487804878049, + "grad_norm": 0.13187662093473074, + "kl": 0.052978515625, + "learning_rate": 2.852638409232077e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1496 + }, + { + "completion_length": 1944.666748046875, + "epoch": 0.22820121951219513, + "grad_norm": 0.07318648585874077, + "kl": 0.0479736328125, + "learning_rate": 2.8522932177987928e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1497 + }, + { + "completion_length": 1031.5000305175781, + "epoch": 0.22835365853658537, + "grad_norm": 0.1005266951388628, + "kl": 0.0672607421875, + "learning_rate": 2.8519476434720844e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1498 + }, + { + "completion_length": 1931.3333740234375, + "epoch": 0.22850609756097562, + "grad_norm": 0.09182325425896548, + "kl": 0.0548095703125, + "learning_rate": 2.8516016863497995e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1499 + }, + { + "completion_length": 3111.166748046875, + "epoch": 0.22865853658536586, + "grad_norm": 0.04105794997453874, + "kl": 0.031494140625, + "learning_rate": 2.8512553465298938e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1500 + }, + { + "completion_length": 1884.166748046875, + "epoch": 0.2288109756097561, + "grad_norm": 0.11326629618267367, + "kl": 0.044677734375, + "learning_rate": 2.8509086241104303e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1501 + }, + { + "completion_length": 737.5000305175781, + "epoch": 0.22896341463414635, + "grad_norm": 0.12604862478124929, + "kl": 0.0596923828125, + "learning_rate": 2.8505615191895826e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1502 + }, + { + "completion_length": 1246.3333740234375, + "epoch": 0.2291158536585366, + "grad_norm": 0.09166660173922758, + "kl": 0.0482177734375, + "learning_rate": 2.850214031865631e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1503 + }, + { + "completion_length": 2003.666748046875, + "epoch": 0.22926829268292684, + "grad_norm": 0.11729836600346809, + "kl": 0.06591796875, + "learning_rate": 2.8498661622369637e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1504 + }, + { + "completion_length": 1862.1666870117188, + "epoch": 0.22942073170731708, + "grad_norm": 0.10052351589665819, + "kl": 0.06591796875, + "learning_rate": 2.849517910402079e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1505 + }, + { + "completion_length": 1330.0000610351562, + "epoch": 0.22957317073170733, + "grad_norm": 0.1150499160370995, + "kl": 0.06005859375, + "learning_rate": 2.8491692764595807e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1506 + }, + { + "completion_length": 2890.5, + "epoch": 0.22972560975609757, + "grad_norm": 0.15971089727152918, + "kl": 0.0443115234375, + "learning_rate": 2.8488202605081837e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1507 + }, + { + "completion_length": 3035.0001220703125, + "epoch": 0.22987804878048781, + "grad_norm": 0.10523728893445966, + "kl": 0.043212890625, + "learning_rate": 2.848470862646709e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1508 + }, + { + "completion_length": 2593.8333740234375, + "epoch": 0.23003048780487806, + "grad_norm": 0.0779477108054995, + "kl": 0.0472412109375, + "learning_rate": 2.8481210829740865e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1509 + }, + { + "completion_length": 2187.5001220703125, + "epoch": 0.2301829268292683, + "grad_norm": 0.05813622854425164, + "kl": 0.045654296875, + "learning_rate": 2.847770921589354e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1510 + }, + { + "completion_length": 2011.5000610351562, + "epoch": 0.23033536585365855, + "grad_norm": 0.09213340756718094, + "kl": 0.0479736328125, + "learning_rate": 2.8474203785916585e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1511 + }, + { + "completion_length": 2402.166748046875, + "epoch": 0.2304878048780488, + "grad_norm": 0.06061724938423738, + "kl": 0.0399169921875, + "learning_rate": 2.8470694540802527e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1512 + }, + { + "completion_length": 2502.8333740234375, + "epoch": 0.23064024390243903, + "grad_norm": 0.05381827623855534, + "kl": 0.044921875, + "learning_rate": 2.8467181481544996e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1513 + }, + { + "completion_length": 2986.33349609375, + "epoch": 0.23079268292682928, + "grad_norm": 0.057792647750241906, + "kl": 0.04150390625, + "learning_rate": 2.846366460913869e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1514 + }, + { + "completion_length": 1963.8334350585938, + "epoch": 0.23094512195121952, + "grad_norm": 0.07316235617979375, + "kl": 0.0445556640625, + "learning_rate": 2.846014392457939e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1515 + }, + { + "completion_length": 1958.0000610351562, + "epoch": 0.23109756097560977, + "grad_norm": 0.13332478677498125, + "kl": 0.0506591796875, + "learning_rate": 2.8456619428863958e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1516 + }, + { + "completion_length": 3004.666748046875, + "epoch": 0.23125, + "grad_norm": 0.08185512365681538, + "kl": 0.0391845703125, + "learning_rate": 2.8453091122990324e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1517 + }, + { + "completion_length": 2502.5000610351562, + "epoch": 0.23140243902439026, + "grad_norm": 0.08012926465698976, + "kl": 0.054443359375, + "learning_rate": 2.844955900795752e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1518 + }, + { + "completion_length": 2384.0, + "epoch": 0.2315548780487805, + "grad_norm": 0.06653012761684364, + "kl": 0.050537109375, + "learning_rate": 2.844602308476563e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1519 + }, + { + "completion_length": 2678.166748046875, + "epoch": 0.23170731707317074, + "grad_norm": 0.04681926515771133, + "kl": 0.0408935546875, + "learning_rate": 2.8442483354415836e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1520 + }, + { + "completion_length": 2925.0001220703125, + "epoch": 0.231859756097561, + "grad_norm": 0.07400876300843519, + "kl": 0.0386962890625, + "learning_rate": 2.8438939817910386e-06, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1521 + }, + { + "completion_length": 2326.166748046875, + "epoch": 0.23201219512195123, + "grad_norm": 0.06141209963148035, + "kl": 0.029052734375, + "learning_rate": 2.8435392476252616e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1522 + }, + { + "completion_length": 1532.166748046875, + "epoch": 0.23216463414634148, + "grad_norm": 0.22311728457989807, + "kl": 0.05810546875, + "learning_rate": 2.843184133044693e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1523 + }, + { + "completion_length": 2396.166748046875, + "epoch": 0.23231707317073172, + "grad_norm": 0.05305763298713461, + "kl": 0.04248046875, + "learning_rate": 2.842828638149881e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1524 + }, + { + "completion_length": 2366.0001220703125, + "epoch": 0.23246951219512196, + "grad_norm": 0.05208739789983062, + "kl": 0.03619384765625, + "learning_rate": 2.8424727630414825e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1525 + }, + { + "completion_length": 2914.8333740234375, + "epoch": 0.2326219512195122, + "grad_norm": 0.13540557308019208, + "kl": 0.03863525390625, + "learning_rate": 2.842116507820261e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1526 + }, + { + "completion_length": 1276.6667175292969, + "epoch": 0.23277439024390245, + "grad_norm": 0.08971697772223587, + "kl": 0.04736328125, + "learning_rate": 2.8417598725870876e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1527 + }, + { + "completion_length": 1623.166748046875, + "epoch": 0.2329268292682927, + "grad_norm": 0.07192128052008448, + "kl": 0.0509033203125, + "learning_rate": 2.841402857442942e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1528 + }, + { + "completion_length": 2552.33349609375, + "epoch": 0.23307926829268294, + "grad_norm": 0.05658801277473649, + "kl": 0.034423828125, + "learning_rate": 2.84104546248891e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1529 + }, + { + "completion_length": 872.3333435058594, + "epoch": 0.23323170731707318, + "grad_norm": 0.12473231812165102, + "kl": 0.0616455078125, + "learning_rate": 2.8406876878261863e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1530 + }, + { + "completion_length": 1724.6666870117188, + "epoch": 0.23338414634146343, + "grad_norm": 0.5144189612939903, + "kl": 0.045166015625, + "learning_rate": 2.8403295335560725e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1531 + }, + { + "completion_length": 2938.3333740234375, + "epoch": 0.23353658536585367, + "grad_norm": 0.07080372097525046, + "kl": 0.034423828125, + "learning_rate": 2.839970999779978e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1532 + }, + { + "completion_length": 1179.1666870117188, + "epoch": 0.23368902439024392, + "grad_norm": 0.07124392621049011, + "kl": 0.048828125, + "learning_rate": 2.8396120865994193e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1533 + }, + { + "completion_length": 2121.166748046875, + "epoch": 0.23384146341463416, + "grad_norm": 0.09000390850089415, + "kl": 0.0548095703125, + "learning_rate": 2.83925279411602e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1534 + }, + { + "completion_length": 2992.666748046875, + "epoch": 0.2339939024390244, + "grad_norm": 0.07994063942422276, + "kl": 0.02581787109375, + "learning_rate": 2.838893122431512e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1535 + }, + { + "completion_length": 3250.3333740234375, + "epoch": 0.23414634146341465, + "grad_norm": 0.07378626005567855, + "kl": 0.02691650390625, + "learning_rate": 2.8385330716477335e-06, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1536 + }, + { + "completion_length": 1192.6666870117188, + "epoch": 0.2342987804878049, + "grad_norm": 0.1008035891818126, + "kl": 0.046142578125, + "learning_rate": 2.838172641866631e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1537 + }, + { + "completion_length": 2288.0000610351562, + "epoch": 0.2344512195121951, + "grad_norm": 0.06745531276647278, + "kl": 0.04461669921875, + "learning_rate": 2.837811833190259e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1538 + }, + { + "completion_length": 1512.3333740234375, + "epoch": 0.23460365853658535, + "grad_norm": 0.07999795092089416, + "kl": 0.0699462890625, + "learning_rate": 2.8374506457207767e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1539 + }, + { + "completion_length": 1717.1666870117188, + "epoch": 0.2347560975609756, + "grad_norm": 0.08711935171641441, + "kl": 0.03472900390625, + "learning_rate": 2.8370890795604523e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1540 + }, + { + "completion_length": 1950.3334350585938, + "epoch": 0.23490853658536584, + "grad_norm": 0.07281379662402686, + "kl": 0.0555419921875, + "learning_rate": 2.8367271348116615e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1541 + }, + { + "completion_length": 786.3333435058594, + "epoch": 0.23506097560975608, + "grad_norm": 0.11485038182795698, + "kl": 0.0675048828125, + "learning_rate": 2.8363648115768875e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1542 + }, + { + "completion_length": 1693.0000610351562, + "epoch": 0.23521341463414633, + "grad_norm": 0.23185409768085222, + "kl": 0.060546875, + "learning_rate": 2.836002109958718e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1543 + }, + { + "completion_length": 735.1666870117188, + "epoch": 0.23536585365853657, + "grad_norm": 0.1160651200376205, + "kl": 0.0545654296875, + "learning_rate": 2.835639030059851e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1544 + }, + { + "completion_length": 1586.8333740234375, + "epoch": 0.23551829268292682, + "grad_norm": 0.13825249053671815, + "kl": 0.0616455078125, + "learning_rate": 2.8352755719830895e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1545 + }, + { + "completion_length": 995.6667175292969, + "epoch": 0.23567073170731706, + "grad_norm": 0.11209487181776316, + "kl": 0.0567626953125, + "learning_rate": 2.8349117358313455e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1546 + }, + { + "completion_length": 1683.0, + "epoch": 0.2358231707317073, + "grad_norm": 0.7081696167628511, + "kl": 0.0635986328125, + "learning_rate": 2.8345475217076364e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1547 + }, + { + "completion_length": 1713.5001220703125, + "epoch": 0.23597560975609755, + "grad_norm": 0.09132552698858702, + "kl": 0.0391845703125, + "learning_rate": 2.834182929715087e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1548 + }, + { + "completion_length": 1946.1667175292969, + "epoch": 0.2361280487804878, + "grad_norm": 0.05707790229124108, + "kl": 0.039794921875, + "learning_rate": 2.8338179599569286e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1549 + }, + { + "completion_length": 1797.166748046875, + "epoch": 0.23628048780487804, + "grad_norm": 0.10721704123244949, + "kl": 0.04345703125, + "learning_rate": 2.8334526125365015e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1550 + }, + { + "completion_length": 1599.3333740234375, + "epoch": 0.23643292682926828, + "grad_norm": 0.123522084834215, + "kl": 0.0498046875, + "learning_rate": 2.8330868875572507e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1551 + }, + { + "completion_length": 1468.8333740234375, + "epoch": 0.23658536585365852, + "grad_norm": 0.09542927843694304, + "kl": 0.0445556640625, + "learning_rate": 2.8327207851227295e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1552 + }, + { + "completion_length": 434.0000305175781, + "epoch": 0.23673780487804877, + "grad_norm": 0.12864568567660445, + "kl": 0.049560546875, + "learning_rate": 2.8323543053365973e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1553 + }, + { + "completion_length": 1847.666748046875, + "epoch": 0.236890243902439, + "grad_norm": 0.11183491342697462, + "kl": 0.058837890625, + "learning_rate": 2.83198744830262e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1554 + }, + { + "completion_length": 867.0, + "epoch": 0.23704268292682926, + "grad_norm": 0.1636010598401806, + "kl": 0.055419921875, + "learning_rate": 2.8316202141246714e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1555 + }, + { + "completion_length": 816.5, + "epoch": 0.2371951219512195, + "grad_norm": 0.10463113338334641, + "kl": 0.055908203125, + "learning_rate": 2.831252602906732e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1556 + }, + { + "completion_length": 1003.3333435058594, + "epoch": 0.23734756097560974, + "grad_norm": 0.10160193531338181, + "kl": 0.0560302734375, + "learning_rate": 2.830884614752888e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1557 + }, + { + "completion_length": 1108.1666870117188, + "epoch": 0.2375, + "grad_norm": 0.10512850763540907, + "kl": 0.047119140625, + "learning_rate": 2.8305162497673325e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1558 + }, + { + "completion_length": 1055.0000610351562, + "epoch": 0.23765243902439023, + "grad_norm": 0.3736357609110812, + "kl": 0.06982421875, + "learning_rate": 2.830147508054367e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1559 + }, + { + "completion_length": 840.6666870117188, + "epoch": 0.23780487804878048, + "grad_norm": 0.091962812928335, + "kl": 0.052490234375, + "learning_rate": 2.829778389718398e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1560 + }, + { + "completion_length": 1687.8333740234375, + "epoch": 0.23795731707317072, + "grad_norm": 0.08534237513078442, + "kl": 0.044921875, + "learning_rate": 2.8294088948639383e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1561 + }, + { + "completion_length": 2216.0, + "epoch": 0.23810975609756097, + "grad_norm": 0.06315864954757149, + "kl": 0.0361328125, + "learning_rate": 2.829039023595609e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1562 + }, + { + "completion_length": 1677.5000610351562, + "epoch": 0.2382621951219512, + "grad_norm": 0.1842130356047631, + "kl": 0.0615234375, + "learning_rate": 2.8286687760181366e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1563 + }, + { + "completion_length": 912.1666870117188, + "epoch": 0.23841463414634145, + "grad_norm": 0.12459785059979651, + "kl": 0.0596923828125, + "learning_rate": 2.828298152236354e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1564 + }, + { + "completion_length": 2027.666748046875, + "epoch": 0.2385670731707317, + "grad_norm": 0.06965748840395204, + "kl": 0.0443115234375, + "learning_rate": 2.8279271523552015e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1565 + }, + { + "completion_length": 1539.5000610351562, + "epoch": 0.23871951219512194, + "grad_norm": 0.07519359222892999, + "kl": 0.049560546875, + "learning_rate": 2.8275557764797255e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1566 + }, + { + "completion_length": 1341.0000610351562, + "epoch": 0.23887195121951219, + "grad_norm": 0.1057112911898819, + "kl": 0.055908203125, + "learning_rate": 2.827184024715078e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1567 + }, + { + "completion_length": 2282.3334350585938, + "epoch": 0.23902439024390243, + "grad_norm": 0.18215819480625528, + "kl": 0.05078125, + "learning_rate": 2.826811897166519e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1568 + }, + { + "completion_length": 1690.666748046875, + "epoch": 0.23917682926829267, + "grad_norm": 0.06716978191180803, + "kl": 0.042724609375, + "learning_rate": 2.8264393939394136e-06, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1569 + }, + { + "completion_length": 1657.3333740234375, + "epoch": 0.23932926829268292, + "grad_norm": 0.06401813555090177, + "kl": 0.03948974609375, + "learning_rate": 2.8260665151392345e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1570 + }, + { + "completion_length": 2036.5000610351562, + "epoch": 0.23948170731707316, + "grad_norm": 0.07745839858846365, + "kl": 0.0438232421875, + "learning_rate": 2.8256932608715592e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1571 + }, + { + "completion_length": 2260.5001220703125, + "epoch": 0.2396341463414634, + "grad_norm": 0.09229865605548838, + "kl": 0.0384521484375, + "learning_rate": 2.8253196312420727e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1572 + }, + { + "completion_length": 1801.8333740234375, + "epoch": 0.23978658536585365, + "grad_norm": 0.07239582952731882, + "kl": 0.0418701171875, + "learning_rate": 2.824945626356566e-06, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1573 + }, + { + "completion_length": 2459.666748046875, + "epoch": 0.2399390243902439, + "grad_norm": 0.12150385187455034, + "kl": 0.0546875, + "learning_rate": 2.824571246320936e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1574 + }, + { + "completion_length": 2704.666748046875, + "epoch": 0.24009146341463414, + "grad_norm": 0.0744597185100899, + "kl": 0.040283203125, + "learning_rate": 2.824196491241186e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1575 + }, + { + "completion_length": 2928.8333740234375, + "epoch": 0.24024390243902438, + "grad_norm": 0.035839778686805365, + "kl": 0.02435302734375, + "learning_rate": 2.8238213612234255e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1576 + }, + { + "completion_length": 2834.3333740234375, + "epoch": 0.24039634146341463, + "grad_norm": 0.05876359340317637, + "kl": 0.04052734375, + "learning_rate": 2.823445856373871e-06, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1577 + }, + { + "completion_length": 1872.8333740234375, + "epoch": 0.24054878048780487, + "grad_norm": 0.07775239021391825, + "kl": 0.0386962890625, + "learning_rate": 2.8230699767988436e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1578 + }, + { + "completion_length": 2560.8333740234375, + "epoch": 0.2407012195121951, + "grad_norm": 0.05075469088693797, + "kl": 0.0369873046875, + "learning_rate": 2.822693722604772e-06, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1579 + }, + { + "completion_length": 2194.166748046875, + "epoch": 0.24085365853658536, + "grad_norm": 0.22970449073681, + "kl": 0.0523681640625, + "learning_rate": 2.822317093898189e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1580 + }, + { + "completion_length": 3196.8333740234375, + "epoch": 0.2410060975609756, + "grad_norm": 0.05154032528890223, + "kl": 0.0284423828125, + "learning_rate": 2.821940090785736e-06, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1581 + }, + { + "completion_length": 1546.3334350585938, + "epoch": 0.24115853658536585, + "grad_norm": 0.08922532347032917, + "kl": 0.0523681640625, + "learning_rate": 2.821562713374159e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1582 + }, + { + "completion_length": 2463.5000610351562, + "epoch": 0.2413109756097561, + "grad_norm": 0.04571982872366619, + "kl": 0.04534912109375, + "learning_rate": 2.8211849617703084e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1583 + }, + { + "completion_length": 3043.666748046875, + "epoch": 0.24146341463414633, + "grad_norm": 0.07987764666111259, + "kl": 0.0399169921875, + "learning_rate": 2.8208068360811445e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1584 + }, + { + "completion_length": 3351.666748046875, + "epoch": 0.24161585365853658, + "grad_norm": 0.08382294902297356, + "kl": 0.03466796875, + "learning_rate": 2.8204283364137294e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1585 + }, + { + "completion_length": 2334.0001220703125, + "epoch": 0.24176829268292682, + "grad_norm": 0.0512425798211104, + "kl": 0.037353515625, + "learning_rate": 2.8200494628752342e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1586 + }, + { + "completion_length": 3358.666748046875, + "epoch": 0.24192073170731707, + "grad_norm": 0.05253189776348563, + "kl": 0.03802490234375, + "learning_rate": 2.819670215572934e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1587 + }, + { + "completion_length": 2441.0, + "epoch": 0.2420731707317073, + "grad_norm": 0.05718737933729773, + "kl": 0.05126953125, + "learning_rate": 2.8192905946142097e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1588 + }, + { + "completion_length": 2878.0, + "epoch": 0.24222560975609755, + "grad_norm": 0.11342626182979101, + "kl": 0.04376220703125, + "learning_rate": 2.81891060010655e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1589 + }, + { + "completion_length": 3075.5, + "epoch": 0.2423780487804878, + "grad_norm": 0.05538967153807326, + "kl": 0.033447265625, + "learning_rate": 2.8185302321575475e-06, + "loss": 0.0013, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1590 + }, + { + "completion_length": 2535.8333740234375, + "epoch": 0.24253048780487804, + "grad_norm": 0.06806916985699189, + "kl": 0.052734375, + "learning_rate": 2.8181494908749002e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1591 + }, + { + "completion_length": 3587.166748046875, + "epoch": 0.2426829268292683, + "grad_norm": 0.04228061206833804, + "kl": 0.03955078125, + "learning_rate": 2.8177683763664137e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1592 + }, + { + "completion_length": 1264.5, + "epoch": 0.24283536585365853, + "grad_norm": 0.16216113307984215, + "kl": 0.0616455078125, + "learning_rate": 2.8173868887399974e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1593 + }, + { + "completion_length": 2826.166748046875, + "epoch": 0.24298780487804877, + "grad_norm": 0.3843715561260172, + "kl": 0.0433349609375, + "learning_rate": 2.817005028103668e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1594 + }, + { + "completion_length": 1992.666748046875, + "epoch": 0.24314024390243902, + "grad_norm": 0.050195015418071545, + "kl": 0.0408935546875, + "learning_rate": 2.816622794565546e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1595 + }, + { + "completion_length": 2958.0001220703125, + "epoch": 0.24329268292682926, + "grad_norm": 0.058795428996287584, + "kl": 0.03515625, + "learning_rate": 2.816240188233859e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1596 + }, + { + "completion_length": 3071.5001220703125, + "epoch": 0.2434451219512195, + "grad_norm": 0.02839733894940964, + "kl": 0.0260009765625, + "learning_rate": 2.8158572092169396e-06, + "loss": 0.001, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1597 + }, + { + "completion_length": 2477.5000610351562, + "epoch": 0.24359756097560975, + "grad_norm": 0.062229069032469435, + "kl": 0.04058837890625, + "learning_rate": 2.815473857623226e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1598 + }, + { + "completion_length": 2465.3333740234375, + "epoch": 0.24375, + "grad_norm": 0.05832582761598885, + "kl": 0.041748046875, + "learning_rate": 2.815090133561262e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1599 + }, + { + "completion_length": 3048.5, + "epoch": 0.24390243902439024, + "grad_norm": 0.07055423616865937, + "kl": 0.02630615234375, + "learning_rate": 2.8147060371396953e-06, + "loss": 0.0011, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1600 + }, + { + "completion_length": 2793.6666870117188, + "epoch": 0.24405487804878048, + "grad_norm": 0.2061015912460425, + "kl": 0.0413818359375, + "learning_rate": 2.8143215684672824e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1601 + }, + { + "completion_length": 1541.0, + "epoch": 0.24420731707317073, + "grad_norm": 0.26495818438714347, + "kl": 0.04833984375, + "learning_rate": 2.813936727652882e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1602 + }, + { + "completion_length": 1147.0, + "epoch": 0.24435975609756097, + "grad_norm": 0.16668979964671005, + "kl": 0.0625, + "learning_rate": 2.81355151480546e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1603 + }, + { + "completion_length": 2243.3333740234375, + "epoch": 0.24451219512195121, + "grad_norm": 0.11558438362543438, + "kl": 0.0443115234375, + "learning_rate": 2.813165930034086e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1604 + }, + { + "completion_length": 2245.5001220703125, + "epoch": 0.24466463414634146, + "grad_norm": 2.762511252909962, + "kl": 0.044921875, + "learning_rate": 2.8127799734479374e-06, + "loss": 0.0018, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1605 + }, + { + "completion_length": 1530.5000610351562, + "epoch": 0.2448170731707317, + "grad_norm": 0.1028072847260505, + "kl": 0.0482177734375, + "learning_rate": 2.812393645156294e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1606 + }, + { + "completion_length": 1451.8333740234375, + "epoch": 0.24496951219512195, + "grad_norm": 0.061858243501667624, + "kl": 0.0377197265625, + "learning_rate": 2.8120069452685434e-06, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1607 + }, + { + "completion_length": 2309.166748046875, + "epoch": 0.2451219512195122, + "grad_norm": 0.061243185363445536, + "kl": 0.03668212890625, + "learning_rate": 2.8116198738941766e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1608 + }, + { + "completion_length": 1256.3333435058594, + "epoch": 0.24527439024390243, + "grad_norm": 0.10996054889255244, + "kl": 0.05224609375, + "learning_rate": 2.811232431142791e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1609 + }, + { + "completion_length": 1980.666748046875, + "epoch": 0.24542682926829268, + "grad_norm": 0.06691202822244142, + "kl": 0.03955078125, + "learning_rate": 2.8108446171240876e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1610 + }, + { + "completion_length": 1618.0000610351562, + "epoch": 0.24557926829268292, + "grad_norm": 0.09494012491814621, + "kl": 0.068603515625, + "learning_rate": 2.8104564319478744e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1611 + }, + { + "completion_length": 1537.0000610351562, + "epoch": 0.24573170731707317, + "grad_norm": 0.12330692210746658, + "kl": 0.0538330078125, + "learning_rate": 2.8100678757240637e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1612 + }, + { + "completion_length": 1355.1666870117188, + "epoch": 0.2458841463414634, + "grad_norm": 0.1084532052586081, + "kl": 0.051025390625, + "learning_rate": 2.809678948562672e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1613 + }, + { + "completion_length": 821.6666870117188, + "epoch": 0.24603658536585366, + "grad_norm": 0.10935636418361361, + "kl": 0.070556640625, + "learning_rate": 2.8092896505738223e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1614 + }, + { + "completion_length": 1271.8333740234375, + "epoch": 0.2461890243902439, + "grad_norm": 0.08277736350897706, + "kl": 0.048095703125, + "learning_rate": 2.8088999818677418e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1615 + }, + { + "completion_length": 2197.3333740234375, + "epoch": 0.24634146341463414, + "grad_norm": 0.07056705678142794, + "kl": 0.04833984375, + "learning_rate": 2.8085099425547627e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1616 + }, + { + "completion_length": 1704.1666870117188, + "epoch": 0.2464939024390244, + "grad_norm": 0.07146593968961731, + "kl": 0.04150390625, + "learning_rate": 2.8081195327453216e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1617 + }, + { + "completion_length": 1345.5000305175781, + "epoch": 0.24664634146341463, + "grad_norm": 0.13442888111818208, + "kl": 0.0562744140625, + "learning_rate": 2.807728752549962e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1618 + }, + { + "completion_length": 1414.0000610351562, + "epoch": 0.24679878048780488, + "grad_norm": 0.10081635498622414, + "kl": 0.0660400390625, + "learning_rate": 2.8073376020793297e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1619 + }, + { + "completion_length": 918.8333740234375, + "epoch": 0.24695121951219512, + "grad_norm": 0.11306566026539988, + "kl": 0.05615234375, + "learning_rate": 2.8069460814441764e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1620 + }, + { + "completion_length": 1103.3333740234375, + "epoch": 0.24710365853658536, + "grad_norm": 0.08484323920856658, + "kl": 0.048828125, + "learning_rate": 2.8065541907553603e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1621 + }, + { + "completion_length": 1881.3333435058594, + "epoch": 0.2472560975609756, + "grad_norm": 0.09157416904363555, + "kl": 0.054931640625, + "learning_rate": 2.8061619301238414e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1622 + }, + { + "completion_length": 1344.6666870117188, + "epoch": 0.24740853658536585, + "grad_norm": 0.11291325170416595, + "kl": 0.0543212890625, + "learning_rate": 2.8057692996606853e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1623 + }, + { + "completion_length": 2652.666748046875, + "epoch": 0.2475609756097561, + "grad_norm": 1.2446591840814785, + "kl": 0.037841796875, + "learning_rate": 2.8053762994770646e-06, + "loss": 0.0015, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1624 + }, + { + "completion_length": 2198.166748046875, + "epoch": 0.24771341463414634, + "grad_norm": 0.04666559046323101, + "kl": 0.03759765625, + "learning_rate": 2.804982929684254e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1625 + }, + { + "completion_length": 1485.1666870117188, + "epoch": 0.24786585365853658, + "grad_norm": 0.12719342776881457, + "kl": 0.05029296875, + "learning_rate": 2.8045891903936338e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1626 + }, + { + "completion_length": 1278.6667175292969, + "epoch": 0.24801829268292683, + "grad_norm": 0.10006795583556098, + "kl": 0.0518798828125, + "learning_rate": 2.8041950817166886e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1627 + }, + { + "completion_length": 1702.8333740234375, + "epoch": 0.24817073170731707, + "grad_norm": 0.11336494810809793, + "kl": 0.0504150390625, + "learning_rate": 2.803800603765008e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1628 + }, + { + "completion_length": 1495.0, + "epoch": 0.24832317073170732, + "grad_norm": 0.15298998397343633, + "kl": 0.0672607421875, + "learning_rate": 2.803405756650286e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1629 + }, + { + "completion_length": 1528.1666870117188, + "epoch": 0.24847560975609756, + "grad_norm": 0.0855048355654469, + "kl": 0.05322265625, + "learning_rate": 2.803010540484321e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1630 + }, + { + "completion_length": 923.6666870117188, + "epoch": 0.2486280487804878, + "grad_norm": 0.09462900221351327, + "kl": 0.0576171875, + "learning_rate": 2.8026149553790165e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1631 + }, + { + "completion_length": 1180.5, + "epoch": 0.24878048780487805, + "grad_norm": 0.11637531487371108, + "kl": 0.0631103515625, + "learning_rate": 2.8022190014463794e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1632 + }, + { + "completion_length": 942.1667175292969, + "epoch": 0.2489329268292683, + "grad_norm": 0.1255801282972222, + "kl": 0.061767578125, + "learning_rate": 2.8018226787985216e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1633 + }, + { + "completion_length": 1168.0000305175781, + "epoch": 0.24908536585365854, + "grad_norm": 0.2806068197861718, + "kl": 0.0650634765625, + "learning_rate": 2.8014259875476596e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1634 + }, + { + "completion_length": 1805.0000610351562, + "epoch": 0.24923780487804878, + "grad_norm": 1.3332134020989221, + "kl": 0.065185546875, + "learning_rate": 2.801028927806114e-06, + "loss": 0.0026, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 1635 + }, + { + "completion_length": 1802.166748046875, + "epoch": 0.24939024390243902, + "grad_norm": 0.07215727152200599, + "kl": 0.04638671875, + "learning_rate": 2.80063149968631e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1636 + }, + { + "completion_length": 1365.0000610351562, + "epoch": 0.24954268292682927, + "grad_norm": 0.10452934159740575, + "kl": 0.0540771484375, + "learning_rate": 2.800233703300777e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1637 + }, + { + "completion_length": 838.8333435058594, + "epoch": 0.2496951219512195, + "grad_norm": 0.20673042266655173, + "kl": 0.085205078125, + "learning_rate": 2.7998355387621478e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1638 + }, + { + "completion_length": 2132.8334350585938, + "epoch": 0.24984756097560976, + "grad_norm": 0.1127922638709304, + "kl": 0.0601806640625, + "learning_rate": 2.7994370061831607e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1639 + }, + { + "completion_length": 892.3333740234375, + "epoch": 0.25, + "grad_norm": 1.997960403306314, + "kl": 0.0732421875, + "learning_rate": 2.7990381056766585e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1640 + }, + { + "completion_length": 1143.8333740234375, + "epoch": 0.2501524390243902, + "grad_norm": 0.20435714476113895, + "kl": 0.088623046875, + "learning_rate": 2.7986388373555856e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1641 + }, + { + "completion_length": 1855.666748046875, + "epoch": 0.2503048780487805, + "grad_norm": 0.13709203304264203, + "kl": 0.076416015625, + "learning_rate": 2.798239201332994e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1642 + }, + { + "completion_length": 849.3333435058594, + "epoch": 0.2504573170731707, + "grad_norm": 0.13389803867624547, + "kl": 0.069580078125, + "learning_rate": 2.797839197722037e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1643 + }, + { + "completion_length": 1393.1667175292969, + "epoch": 0.250609756097561, + "grad_norm": 0.19819356954595324, + "kl": 0.0859375, + "learning_rate": 2.7974388266359745e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1644 + }, + { + "completion_length": 733.1666870117188, + "epoch": 0.2507621951219512, + "grad_norm": 0.10468119558257262, + "kl": 0.05615234375, + "learning_rate": 2.7970380881881677e-06, + "loss": 0.0022, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1645 + }, + { + "completion_length": 986.3333435058594, + "epoch": 0.25091463414634146, + "grad_norm": 9.452882166978554, + "kl": 0.094482421875, + "learning_rate": 2.796636982492084e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1646 + }, + { + "completion_length": 1103.1666870117188, + "epoch": 0.2510670731707317, + "grad_norm": 1.6494456982155494, + "kl": 0.094970703125, + "learning_rate": 2.7962355096612934e-06, + "loss": 0.0038, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1647 + }, + { + "completion_length": 2018.166748046875, + "epoch": 0.25121951219512195, + "grad_norm": 1.016846632426041, + "kl": 0.073974609375, + "learning_rate": 2.795833669809471e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1648 + }, + { + "completion_length": 2616.1666870117188, + "epoch": 0.25137195121951217, + "grad_norm": 0.09489710194288777, + "kl": 0.0589599609375, + "learning_rate": 2.7954314630503958e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1649 + }, + { + "completion_length": 3463.3333740234375, + "epoch": 0.25152439024390244, + "grad_norm": 0.0787201862378805, + "kl": 0.056884765625, + "learning_rate": 2.7950288894979482e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1650 + }, + { + "completion_length": 3458.3333740234375, + "epoch": 0.25167682926829266, + "grad_norm": 0.11375838252205844, + "kl": 0.060302734375, + "learning_rate": 2.794625949266116e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1651 + }, + { + "completion_length": 2659.3333740234375, + "epoch": 0.25182926829268293, + "grad_norm": 0.09091175158096262, + "kl": 0.0616455078125, + "learning_rate": 2.794222642468989e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1652 + }, + { + "completion_length": 1864.666748046875, + "epoch": 0.25198170731707314, + "grad_norm": 1.85916015297209, + "kl": 0.0728759765625, + "learning_rate": 2.793818969220761e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1653 + }, + { + "completion_length": 3534.0, + "epoch": 0.2521341463414634, + "grad_norm": 0.07129385380219415, + "kl": 0.0582275390625, + "learning_rate": 2.793414929635729e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1654 + }, + { + "completion_length": 3245.0, + "epoch": 0.25228658536585363, + "grad_norm": 0.08453639015564901, + "kl": 0.0574951171875, + "learning_rate": 2.793010523828295e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1655 + }, + { + "completion_length": 2882.666748046875, + "epoch": 0.2524390243902439, + "grad_norm": 0.14078492089268096, + "kl": 0.0797119140625, + "learning_rate": 2.7926057519129634e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1656 + }, + { + "completion_length": 2033.666748046875, + "epoch": 0.2525914634146341, + "grad_norm": 0.13764472142012926, + "kl": 0.070556640625, + "learning_rate": 2.7922006140043436e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1657 + }, + { + "completion_length": 1302.1666870117188, + "epoch": 0.2527439024390244, + "grad_norm": 0.12796458928760931, + "kl": 0.06884765625, + "learning_rate": 2.791795110217147e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1658 + }, + { + "completion_length": 1451.3333435058594, + "epoch": 0.2528963414634146, + "grad_norm": 0.12408936082753416, + "kl": 0.0908203125, + "learning_rate": 2.7913892406661906e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1659 + }, + { + "completion_length": 1538.6666870117188, + "epoch": 0.2530487804878049, + "grad_norm": 0.10798825298488356, + "kl": 0.0606689453125, + "learning_rate": 2.790983005466392e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1660 + }, + { + "completion_length": 1102.3333740234375, + "epoch": 0.2532012195121951, + "grad_norm": 0.3801441245960549, + "kl": 0.064208984375, + "learning_rate": 2.7905764047327762e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1661 + }, + { + "completion_length": 1009.1666870117188, + "epoch": 0.25335365853658537, + "grad_norm": 0.12696357958537036, + "kl": 0.070068359375, + "learning_rate": 2.790169438580469e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1662 + }, + { + "completion_length": 830.3333435058594, + "epoch": 0.2535060975609756, + "grad_norm": 0.37404385802028334, + "kl": 0.107666015625, + "learning_rate": 2.7897621071246996e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1663 + }, + { + "completion_length": 1284.6666870117188, + "epoch": 0.25365853658536586, + "grad_norm": 0.09749728480415301, + "kl": 0.0596923828125, + "learning_rate": 2.7893544104808017e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1664 + }, + { + "completion_length": 1116.5, + "epoch": 0.2538109756097561, + "grad_norm": 0.13853447787625014, + "kl": 0.11376953125, + "learning_rate": 2.7889463487642127e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1665 + }, + { + "completion_length": 1783.1666870117188, + "epoch": 0.25396341463414634, + "grad_norm": 0.11567149838674551, + "kl": 0.095458984375, + "learning_rate": 2.788537922090472e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1666 + }, + { + "completion_length": 1702.5000915527344, + "epoch": 0.25411585365853656, + "grad_norm": 0.13358643441011536, + "kl": 0.078857421875, + "learning_rate": 2.788129130575224e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1667 + }, + { + "completion_length": 2233.6666870117188, + "epoch": 0.25426829268292683, + "grad_norm": 0.1389584286969798, + "kl": 0.088623046875, + "learning_rate": 2.7877199743342145e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1668 + }, + { + "completion_length": 1049.6666870117188, + "epoch": 0.25442073170731705, + "grad_norm": 0.2050261952648383, + "kl": 0.097412109375, + "learning_rate": 2.787310453483294e-06, + "loss": 0.0039, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1669 + }, + { + "completion_length": 1210.3333435058594, + "epoch": 0.2545731707317073, + "grad_norm": 0.1262586065822661, + "kl": 0.091552734375, + "learning_rate": 2.7869005681384152e-06, + "loss": 0.0037, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1670 + }, + { + "completion_length": 1461.8333740234375, + "epoch": 0.25472560975609754, + "grad_norm": 0.11468192775605301, + "kl": 0.0675048828125, + "learning_rate": 2.786490318415636e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1671 + }, + { + "completion_length": 2402.8334350585938, + "epoch": 0.2548780487804878, + "grad_norm": 0.10563196430953836, + "kl": 0.0615234375, + "learning_rate": 2.7860797044311143e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1672 + }, + { + "completion_length": 3864.0001220703125, + "epoch": 0.255030487804878, + "grad_norm": 0.04903100149061681, + "kl": 0.0570068359375, + "learning_rate": 2.7856687263011144e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1673 + }, + { + "completion_length": 1417.5, + "epoch": 0.2551829268292683, + "grad_norm": 0.12894827151488233, + "kl": 0.10009765625, + "learning_rate": 2.785257384142001e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1674 + }, + { + "completion_length": 1401.166748046875, + "epoch": 0.2553353658536585, + "grad_norm": 0.1360832295195552, + "kl": 0.08154296875, + "learning_rate": 2.784845678070244e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1675 + }, + { + "completion_length": 2508.5, + "epoch": 0.2554878048780488, + "grad_norm": 0.12104377651173771, + "kl": 0.0743408203125, + "learning_rate": 2.784433608202415e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1676 + }, + { + "completion_length": 1799.3334350585938, + "epoch": 0.255640243902439, + "grad_norm": 0.12127327860068883, + "kl": 0.06494140625, + "learning_rate": 2.784021174655189e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1677 + }, + { + "completion_length": 1512.5000610351562, + "epoch": 0.2557926829268293, + "grad_norm": 0.10003469657283834, + "kl": 0.0601806640625, + "learning_rate": 2.783608377545344e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1678 + }, + { + "completion_length": 1472.666748046875, + "epoch": 0.2559451219512195, + "grad_norm": 0.12950784455456582, + "kl": 0.064208984375, + "learning_rate": 2.7831952169897613e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1679 + }, + { + "completion_length": 1885.8333740234375, + "epoch": 0.25609756097560976, + "grad_norm": 0.08268230938999808, + "kl": 0.0511474609375, + "learning_rate": 2.7827816931054245e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1680 + }, + { + "completion_length": 1369.0, + "epoch": 0.25625, + "grad_norm": 0.187208128464144, + "kl": 0.0531005859375, + "learning_rate": 2.78236780600942e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1681 + }, + { + "completion_length": 1477.666748046875, + "epoch": 0.25640243902439025, + "grad_norm": 0.1291730636944129, + "kl": 0.059326171875, + "learning_rate": 2.7819535558189377e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1682 + }, + { + "completion_length": 2097.8333740234375, + "epoch": 0.25655487804878047, + "grad_norm": 0.08168743303066828, + "kl": 0.064697265625, + "learning_rate": 2.7815389426512696e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1683 + }, + { + "completion_length": 1357.8333435058594, + "epoch": 0.25670731707317074, + "grad_norm": 0.14713454384269795, + "kl": 0.0609130859375, + "learning_rate": 2.7811239666238117e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1684 + }, + { + "completion_length": 1778.166748046875, + "epoch": 0.25685975609756095, + "grad_norm": 1.032580114129089, + "kl": 0.0560302734375, + "learning_rate": 2.780708627854061e-06, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1685 + }, + { + "completion_length": 1418.3333740234375, + "epoch": 0.2570121951219512, + "grad_norm": 0.22582905114991325, + "kl": 0.077392578125, + "learning_rate": 2.780292926459619e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1686 + }, + { + "completion_length": 1016.6666870117188, + "epoch": 0.25716463414634144, + "grad_norm": 0.10807289366392407, + "kl": 0.0594482421875, + "learning_rate": 2.7798768625581875e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1687 + }, + { + "completion_length": 1734.6666870117188, + "epoch": 0.2573170731707317, + "grad_norm": 0.07651268987585702, + "kl": 0.063720703125, + "learning_rate": 2.7794604362675733e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1688 + }, + { + "completion_length": 1002.1667175292969, + "epoch": 0.25746951219512193, + "grad_norm": 0.09365032772009775, + "kl": 0.043701171875, + "learning_rate": 2.7790436477056856e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1689 + }, + { + "completion_length": 1015.8333740234375, + "epoch": 0.2576219512195122, + "grad_norm": 0.08831219935208774, + "kl": 0.0604248046875, + "learning_rate": 2.7786264969905347e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1690 + }, + { + "completion_length": 960.6666870117188, + "epoch": 0.2577743902439024, + "grad_norm": 0.13605423015368026, + "kl": 0.0640869140625, + "learning_rate": 2.778208984240234e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1691 + }, + { + "completion_length": 1014.8333435058594, + "epoch": 0.2579268292682927, + "grad_norm": 0.1108128267820216, + "kl": 0.0404052734375, + "learning_rate": 2.777791109573e-06, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1692 + }, + { + "completion_length": 1172.6667175292969, + "epoch": 0.2580792682926829, + "grad_norm": 0.10925960481574679, + "kl": 0.052001953125, + "learning_rate": 2.7773728731071514e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1693 + }, + { + "completion_length": 1415.5000610351562, + "epoch": 0.2582317073170732, + "grad_norm": 0.11505254234594531, + "kl": 0.0540771484375, + "learning_rate": 2.7769542749611094e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1694 + }, + { + "completion_length": 2295.0001220703125, + "epoch": 0.2583841463414634, + "grad_norm": 0.12074910563693271, + "kl": 0.0604248046875, + "learning_rate": 2.776535315253397e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1695 + }, + { + "completion_length": 1771.1667175292969, + "epoch": 0.25853658536585367, + "grad_norm": 0.09697113601375959, + "kl": 0.0677490234375, + "learning_rate": 2.7761159941026403e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1696 + }, + { + "completion_length": 698.6666870117188, + "epoch": 0.2586890243902439, + "grad_norm": 0.1149784397276876, + "kl": 0.051513671875, + "learning_rate": 2.7756963116275674e-06, + "loss": 0.0021, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1697 + }, + { + "completion_length": 2444.3333435058594, + "epoch": 0.25884146341463415, + "grad_norm": 0.11461781300604082, + "kl": 0.0447998046875, + "learning_rate": 2.7752762679470086e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1698 + }, + { + "completion_length": 1972.1666870117188, + "epoch": 0.25899390243902437, + "grad_norm": 0.21136122593299175, + "kl": 0.07958984375, + "learning_rate": 2.7748558631798975e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1699 + }, + { + "completion_length": 1991.3334350585938, + "epoch": 0.25914634146341464, + "grad_norm": 0.10322517956788263, + "kl": 0.0625, + "learning_rate": 2.7744350974452685e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1700 + }, + { + "completion_length": 1898.8334350585938, + "epoch": 0.25929878048780486, + "grad_norm": 0.08706838872999896, + "kl": 0.0618896484375, + "learning_rate": 2.774013970862258e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1701 + }, + { + "completion_length": 1477.6666870117188, + "epoch": 0.25945121951219513, + "grad_norm": 0.11245655620286493, + "kl": 0.06884765625, + "learning_rate": 2.7735924835501063e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1702 + }, + { + "completion_length": 2068.5, + "epoch": 0.25960365853658535, + "grad_norm": 0.09491257382315454, + "kl": 0.067626953125, + "learning_rate": 2.773170635628155e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1703 + }, + { + "completion_length": 1744.5000610351562, + "epoch": 0.2597560975609756, + "grad_norm": 0.07714782002743892, + "kl": 0.048095703125, + "learning_rate": 2.772748427215848e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1704 + }, + { + "completion_length": 1549.1666870117188, + "epoch": 0.25990853658536583, + "grad_norm": 0.24301630412344202, + "kl": 0.051025390625, + "learning_rate": 2.7723258584327298e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1705 + }, + { + "completion_length": 1220.8333740234375, + "epoch": 0.2600609756097561, + "grad_norm": 0.13769195638854614, + "kl": 0.05517578125, + "learning_rate": 2.771902929398449e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1706 + }, + { + "completion_length": 2276.3333740234375, + "epoch": 0.2602134146341463, + "grad_norm": 0.08881862919755962, + "kl": 0.04736328125, + "learning_rate": 2.7714796402327552e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1707 + }, + { + "completion_length": 2866.666748046875, + "epoch": 0.2603658536585366, + "grad_norm": 1.5051230327279215, + "kl": 0.0443115234375, + "learning_rate": 2.7710559910555e-06, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1708 + }, + { + "completion_length": 1228.0000610351562, + "epoch": 0.2605182926829268, + "grad_norm": 0.08988823408251358, + "kl": 0.0504150390625, + "learning_rate": 2.770631981986637e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1709 + }, + { + "completion_length": 1440.5000610351562, + "epoch": 0.2606707317073171, + "grad_norm": 0.13759730072336265, + "kl": 0.1005859375, + "learning_rate": 2.7702076131462213e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1710 + }, + { + "completion_length": 2104.1666870117188, + "epoch": 0.2608231707317073, + "grad_norm": 0.8928430347073721, + "kl": 0.051513671875, + "learning_rate": 2.7697828846544116e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1711 + }, + { + "completion_length": 1484.0000610351562, + "epoch": 0.26097560975609757, + "grad_norm": 0.13532543871807848, + "kl": 0.05224609375, + "learning_rate": 2.7693577966314664e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1712 + }, + { + "completion_length": 2382.0, + "epoch": 0.2611280487804878, + "grad_norm": 0.08335042035936661, + "kl": 0.0443115234375, + "learning_rate": 2.768932349197746e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1713 + }, + { + "completion_length": 2239.3333740234375, + "epoch": 0.26128048780487806, + "grad_norm": 0.12160496202973861, + "kl": 0.0625, + "learning_rate": 2.7685065424737142e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1714 + }, + { + "completion_length": 1737.5001220703125, + "epoch": 0.2614329268292683, + "grad_norm": 0.12217012233235532, + "kl": 0.074462890625, + "learning_rate": 2.7680803765799353e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1715 + }, + { + "completion_length": 1099.6666870117188, + "epoch": 0.26158536585365855, + "grad_norm": 0.1407734321034382, + "kl": 0.050537109375, + "learning_rate": 2.7676538516370753e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1716 + }, + { + "completion_length": 2525.5000610351562, + "epoch": 0.26173780487804876, + "grad_norm": 0.1032135072694566, + "kl": 0.0631103515625, + "learning_rate": 2.767226967765902e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1717 + }, + { + "completion_length": 1419.5000305175781, + "epoch": 0.26189024390243903, + "grad_norm": 0.12361232885440519, + "kl": 0.0615234375, + "learning_rate": 2.766799725087286e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1718 + }, + { + "completion_length": 1157.3333740234375, + "epoch": 0.26204268292682925, + "grad_norm": 0.1278009821594642, + "kl": 0.070068359375, + "learning_rate": 2.7663721237221965e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1719 + }, + { + "completion_length": 1230.1667175292969, + "epoch": 0.2621951219512195, + "grad_norm": 0.10892704884382737, + "kl": 0.07666015625, + "learning_rate": 2.7659441637917076e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1720 + }, + { + "completion_length": 2320.8333740234375, + "epoch": 0.26234756097560974, + "grad_norm": 0.08922385430943357, + "kl": 0.060791015625, + "learning_rate": 2.7655158454169933e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1721 + }, + { + "completion_length": 925.5000305175781, + "epoch": 0.2625, + "grad_norm": 0.34000529884211317, + "kl": 0.0693359375, + "learning_rate": 2.765087168719329e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1722 + }, + { + "completion_length": 1281.5, + "epoch": 0.2626524390243902, + "grad_norm": 0.08280326874400976, + "kl": 0.0513916015625, + "learning_rate": 2.7646581338200914e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1723 + }, + { + "completion_length": 2365.0000915527344, + "epoch": 0.2628048780487805, + "grad_norm": 0.12494369858653155, + "kl": 0.063232421875, + "learning_rate": 2.76422874084076e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1724 + }, + { + "completion_length": 2019.8334350585938, + "epoch": 0.2629573170731707, + "grad_norm": 0.10205382197290298, + "kl": 0.07763671875, + "learning_rate": 2.7637989899029144e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1725 + }, + { + "completion_length": 1300.8333740234375, + "epoch": 0.263109756097561, + "grad_norm": 0.14396490265314002, + "kl": 0.055908203125, + "learning_rate": 2.763368881128236e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1726 + }, + { + "completion_length": 1565.5000915527344, + "epoch": 0.2632621951219512, + "grad_norm": 1.2237260782944306, + "kl": 0.07177734375, + "learning_rate": 2.762938414638507e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1727 + }, + { + "completion_length": 2682.8333740234375, + "epoch": 0.2634146341463415, + "grad_norm": 0.07210985158439205, + "kl": 0.0599365234375, + "learning_rate": 2.7625075905556117e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1728 + }, + { + "completion_length": 829.6667175292969, + "epoch": 0.2635670731707317, + "grad_norm": 0.1141037753381167, + "kl": 0.0570068359375, + "learning_rate": 2.762076409001535e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1729 + }, + { + "completion_length": 1751.6666870117188, + "epoch": 0.26371951219512196, + "grad_norm": 1.610616962765786, + "kl": 0.0548095703125, + "learning_rate": 2.7616448700983637e-06, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1730 + }, + { + "completion_length": 3223.8333740234375, + "epoch": 0.2638719512195122, + "grad_norm": 0.08017495062372527, + "kl": 0.05908203125, + "learning_rate": 2.761212973968285e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1731 + }, + { + "completion_length": 1818.3333740234375, + "epoch": 0.26402439024390245, + "grad_norm": 0.07620228916856382, + "kl": 0.0491943359375, + "learning_rate": 2.760780720733588e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1732 + }, + { + "completion_length": 2622.0, + "epoch": 0.26417682926829267, + "grad_norm": 0.11561286236023104, + "kl": 0.0587158203125, + "learning_rate": 2.7603481105166616e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1733 + }, + { + "completion_length": 1408.3333740234375, + "epoch": 0.26432926829268294, + "grad_norm": 0.14183778147347423, + "kl": 0.0771484375, + "learning_rate": 2.7599151434399976e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1734 + }, + { + "completion_length": 3191.166748046875, + "epoch": 0.26448170731707316, + "grad_norm": 0.06555611366130264, + "kl": 0.0501708984375, + "learning_rate": 2.759481819626188e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1735 + }, + { + "completion_length": 3043.8333740234375, + "epoch": 0.2646341463414634, + "grad_norm": 0.07613811943845664, + "kl": 0.046630859375, + "learning_rate": 2.7590481391979253e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1736 + }, + { + "completion_length": 3600.666748046875, + "epoch": 0.26478658536585364, + "grad_norm": 2.095495016064075, + "kl": 0.0731201171875, + "learning_rate": 2.7586141022780033e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1737 + }, + { + "completion_length": 3424.5, + "epoch": 0.2649390243902439, + "grad_norm": 0.05297903691171508, + "kl": 0.04052734375, + "learning_rate": 2.758179708989317e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1738 + }, + { + "completion_length": 2409.666748046875, + "epoch": 0.26509146341463413, + "grad_norm": 0.10964141321628737, + "kl": 0.054443359375, + "learning_rate": 2.7577449594548626e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1739 + }, + { + "completion_length": 2910.666748046875, + "epoch": 0.2652439024390244, + "grad_norm": 0.182335455571811, + "kl": 0.05419921875, + "learning_rate": 2.757309853797736e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1740 + }, + { + "completion_length": 2293.3333435058594, + "epoch": 0.2653963414634146, + "grad_norm": 0.12166177567497624, + "kl": 0.06494140625, + "learning_rate": 2.756874392141135e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1741 + }, + { + "completion_length": 3828.3333740234375, + "epoch": 0.2655487804878049, + "grad_norm": 0.04631836118382428, + "kl": 0.0458984375, + "learning_rate": 2.756438574608358e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1742 + }, + { + "completion_length": 2876.8333740234375, + "epoch": 0.2657012195121951, + "grad_norm": 0.23644467759453597, + "kl": 0.0543212890625, + "learning_rate": 2.7560024013228043e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1743 + }, + { + "completion_length": 4096.0, + "epoch": 0.2658536585365854, + "grad_norm": 0.04486671360827439, + "kl": 0.0435791015625, + "learning_rate": 2.755565872407973e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1744 + }, + { + "completion_length": 1730.1666870117188, + "epoch": 0.2660060975609756, + "grad_norm": 1.4455629928473643, + "kl": 0.0830078125, + "learning_rate": 2.755128987987465e-06, + "loss": 0.0033, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1745 + }, + { + "completion_length": 2756.5, + "epoch": 0.26615853658536587, + "grad_norm": 0.15886433663656718, + "kl": 0.0711669921875, + "learning_rate": 2.7546917481849808e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1746 + }, + { + "completion_length": 1888.1666870117188, + "epoch": 0.2663109756097561, + "grad_norm": 0.16857659016448256, + "kl": 0.08349609375, + "learning_rate": 2.7542541531243225e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1747 + }, + { + "completion_length": 1597.0000610351562, + "epoch": 0.26646341463414636, + "grad_norm": 0.14402836247022868, + "kl": 0.07666015625, + "learning_rate": 2.7538162029293933e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1748 + }, + { + "completion_length": 3082.8333740234375, + "epoch": 0.2666158536585366, + "grad_norm": 0.1356885335091537, + "kl": 0.07177734375, + "learning_rate": 2.7533778977241945e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1749 + }, + { + "completion_length": 1731.3333435058594, + "epoch": 0.26676829268292684, + "grad_norm": 0.16163776602124416, + "kl": 0.094970703125, + "learning_rate": 2.752939237632831e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1750 + }, + { + "completion_length": 1667.1666870117188, + "epoch": 0.26692073170731706, + "grad_norm": 0.1634702266843064, + "kl": 0.0804443359375, + "learning_rate": 2.7525002227795054e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1751 + }, + { + "completion_length": 1850.166748046875, + "epoch": 0.26707317073170733, + "grad_norm": 0.14459471902790708, + "kl": 0.0888671875, + "learning_rate": 2.7520608532885228e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1752 + }, + { + "completion_length": 1670.1666870117188, + "epoch": 0.26722560975609755, + "grad_norm": 0.1609372797254638, + "kl": 0.0615234375, + "learning_rate": 2.7516211292842875e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1753 + }, + { + "completion_length": 1628.8333740234375, + "epoch": 0.2673780487804878, + "grad_norm": 0.12856314259505014, + "kl": 0.0859375, + "learning_rate": 2.751181050891305e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1754 + }, + { + "completion_length": 1213.1666870117188, + "epoch": 0.26753048780487804, + "grad_norm": 0.18681924502387692, + "kl": 0.08837890625, + "learning_rate": 2.750740618234181e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1755 + }, + { + "completion_length": 1725.8334350585938, + "epoch": 0.2676829268292683, + "grad_norm": 0.14281774949615078, + "kl": 0.0811767578125, + "learning_rate": 2.75029983143762e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1756 + }, + { + "completion_length": 1640.3334350585938, + "epoch": 0.2678353658536585, + "grad_norm": 0.344656471877829, + "kl": 0.110595703125, + "learning_rate": 2.7498586906264296e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1757 + }, + { + "completion_length": 972.3333435058594, + "epoch": 0.2679878048780488, + "grad_norm": 0.14409553372875966, + "kl": 0.0947265625, + "learning_rate": 2.7494171959255153e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1758 + }, + { + "completion_length": 1041.5000610351562, + "epoch": 0.268140243902439, + "grad_norm": 0.10886791923077849, + "kl": 0.0614013671875, + "learning_rate": 2.7489753474598833e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1759 + }, + { + "completion_length": 804.1666870117188, + "epoch": 0.2682926829268293, + "grad_norm": 0.12503179822372268, + "kl": 0.0728759765625, + "learning_rate": 2.7485331453546407e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1760 + }, + { + "completion_length": 1186.6667175292969, + "epoch": 0.2684451219512195, + "grad_norm": 0.10896915932626346, + "kl": 0.0677490234375, + "learning_rate": 2.7480905897349947e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1761 + }, + { + "completion_length": 1119.0000610351562, + "epoch": 0.2685975609756098, + "grad_norm": 0.241862051000167, + "kl": 0.0670166015625, + "learning_rate": 2.747647680726251e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1762 + }, + { + "completion_length": 902.0000305175781, + "epoch": 0.26875, + "grad_norm": 0.17019455630764477, + "kl": 0.0811767578125, + "learning_rate": 2.747204418453818e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1763 + }, + { + "completion_length": 729.0000305175781, + "epoch": 0.26890243902439026, + "grad_norm": 0.19385128298564977, + "kl": 0.078369140625, + "learning_rate": 2.7467608030432016e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1764 + }, + { + "completion_length": 941.1666870117188, + "epoch": 0.2690548780487805, + "grad_norm": 0.1331013966395425, + "kl": 0.09716796875, + "learning_rate": 2.746316834620009e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1765 + }, + { + "completion_length": 1222.1666870117188, + "epoch": 0.26920731707317075, + "grad_norm": 0.10319328128031735, + "kl": 0.064697265625, + "learning_rate": 2.7458725133099467e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1766 + }, + { + "completion_length": 2515.6666870117188, + "epoch": 0.26935975609756097, + "grad_norm": 0.0546742862152582, + "kl": 0.0523681640625, + "learning_rate": 2.7454278392388225e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1767 + }, + { + "completion_length": 1277.5000610351562, + "epoch": 0.26951219512195124, + "grad_norm": 0.0917183572614672, + "kl": 0.0426025390625, + "learning_rate": 2.744982812532542e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1768 + }, + { + "completion_length": 1910.8333435058594, + "epoch": 0.26966463414634145, + "grad_norm": 0.1006542587456892, + "kl": 0.0693359375, + "learning_rate": 2.744537433317113e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1769 + }, + { + "completion_length": 779.8333435058594, + "epoch": 0.2698170731707317, + "grad_norm": 0.11827788586915812, + "kl": 0.087890625, + "learning_rate": 2.7440917017186405e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1770 + }, + { + "completion_length": 2652.0001220703125, + "epoch": 0.26996951219512194, + "grad_norm": 0.0725132266071387, + "kl": 0.0531005859375, + "learning_rate": 2.7436456178633316e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1771 + }, + { + "completion_length": 1175.3333740234375, + "epoch": 0.2701219512195122, + "grad_norm": 0.1128074317051247, + "kl": 0.080322265625, + "learning_rate": 2.743199181877492e-06, + "loss": 0.0032, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1772 + }, + { + "completion_length": 1817.5000610351562, + "epoch": 0.27027439024390243, + "grad_norm": 0.3175752891431345, + "kl": 0.0625, + "learning_rate": 2.742752393887527e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1773 + }, + { + "completion_length": 1691.0, + "epoch": 0.2704268292682927, + "grad_norm": 0.08366555616193003, + "kl": 0.061767578125, + "learning_rate": 2.7423052540199415e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1774 + }, + { + "completion_length": 2373.166748046875, + "epoch": 0.2705792682926829, + "grad_norm": 0.0692689727340542, + "kl": 0.047607421875, + "learning_rate": 2.7418577624013414e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1775 + }, + { + "completion_length": 1485.0000305175781, + "epoch": 0.2707317073170732, + "grad_norm": 0.09928347920263454, + "kl": 0.068359375, + "learning_rate": 2.7414099191584305e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1776 + }, + { + "completion_length": 874.8333740234375, + "epoch": 0.2708841463414634, + "grad_norm": 0.1969003704031224, + "kl": 0.07275390625, + "learning_rate": 2.740961724418013e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1777 + }, + { + "completion_length": 1014.3333435058594, + "epoch": 0.2710365853658537, + "grad_norm": 0.1069462483648732, + "kl": 0.06494140625, + "learning_rate": 2.7405131783069923e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1778 + }, + { + "completion_length": 1208.0000305175781, + "epoch": 0.2711890243902439, + "grad_norm": 0.0919821920596078, + "kl": 0.0596923828125, + "learning_rate": 2.740064280952372e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1779 + }, + { + "completion_length": 2592.0, + "epoch": 0.27134146341463417, + "grad_norm": 0.0721835206816112, + "kl": 0.0543212890625, + "learning_rate": 2.739615032481253e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1780 + }, + { + "completion_length": 2478.0001220703125, + "epoch": 0.2714939024390244, + "grad_norm": 0.09628266556500904, + "kl": 0.06640625, + "learning_rate": 2.73916543302084e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1781 + }, + { + "completion_length": 848.5000305175781, + "epoch": 0.27164634146341465, + "grad_norm": 0.1375759355490649, + "kl": 0.0645751953125, + "learning_rate": 2.7387154826984314e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1782 + }, + { + "completion_length": 2026.5000610351562, + "epoch": 0.27179878048780487, + "grad_norm": 0.07275692158013139, + "kl": 0.0582275390625, + "learning_rate": 2.7382651816414296e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1783 + }, + { + "completion_length": 1593.3333740234375, + "epoch": 0.27195121951219514, + "grad_norm": 0.05831915079010222, + "kl": 0.0399169921875, + "learning_rate": 2.7378145299773337e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1784 + }, + { + "completion_length": 1431.0, + "epoch": 0.27210365853658536, + "grad_norm": 0.09556626117121331, + "kl": 0.052978515625, + "learning_rate": 2.737363527833744e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1785 + }, + { + "completion_length": 1392.6667175292969, + "epoch": 0.27225609756097563, + "grad_norm": 0.11290220435489613, + "kl": 0.0484619140625, + "learning_rate": 2.7369121753383573e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1786 + }, + { + "completion_length": 799.8333740234375, + "epoch": 0.27240853658536585, + "grad_norm": 0.11195269195165014, + "kl": 0.0499267578125, + "learning_rate": 2.736460472618973e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1787 + }, + { + "completion_length": 1197.8333740234375, + "epoch": 0.2725609756097561, + "grad_norm": 0.4243839852504839, + "kl": 0.05419921875, + "learning_rate": 2.7360084198034864e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1788 + }, + { + "completion_length": 1266.1666870117188, + "epoch": 0.27271341463414633, + "grad_norm": 0.09675840285768393, + "kl": 0.0538330078125, + "learning_rate": 2.7355560170198944e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1789 + }, + { + "completion_length": 1309.3333740234375, + "epoch": 0.2728658536585366, + "grad_norm": 0.13129960719008354, + "kl": 0.0638427734375, + "learning_rate": 2.735103264396292e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1790 + }, + { + "completion_length": 1629.8334350585938, + "epoch": 0.2730182926829268, + "grad_norm": 0.16592122895918618, + "kl": 0.071533203125, + "learning_rate": 2.7346501620608726e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1791 + }, + { + "completion_length": 913.0, + "epoch": 0.2731707317073171, + "grad_norm": 0.15768549342440194, + "kl": 0.05615234375, + "learning_rate": 2.7341967101419303e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1792 + }, + { + "completion_length": 1639.6667175292969, + "epoch": 0.2733231707317073, + "grad_norm": 0.1261496561758696, + "kl": 0.04931640625, + "learning_rate": 2.733742908767856e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1793 + }, + { + "completion_length": 1501.8333435058594, + "epoch": 0.2734756097560976, + "grad_norm": 1.3088237307318153, + "kl": 0.05224609375, + "learning_rate": 2.733288758067142e-06, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1794 + }, + { + "completion_length": 960.1666870117188, + "epoch": 0.2736280487804878, + "grad_norm": 0.1315733872857596, + "kl": 0.06005859375, + "learning_rate": 2.7328342581683777e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1795 + }, + { + "completion_length": 1740.1667175292969, + "epoch": 0.27378048780487807, + "grad_norm": 0.08270174330051523, + "kl": 0.047119140625, + "learning_rate": 2.7323794092002518e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1796 + }, + { + "completion_length": 1293.0, + "epoch": 0.2739329268292683, + "grad_norm": 0.08024158069437243, + "kl": 0.0496826171875, + "learning_rate": 2.731924211291552e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1797 + }, + { + "completion_length": 1668.5001220703125, + "epoch": 0.27408536585365856, + "grad_norm": 0.09304296485617068, + "kl": 0.0504150390625, + "learning_rate": 2.7314686645711648e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1798 + }, + { + "completion_length": 1055.0000610351562, + "epoch": 0.2742378048780488, + "grad_norm": 0.12228887261762514, + "kl": 0.051025390625, + "learning_rate": 2.7310127691680753e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1799 + }, + { + "completion_length": 949.3333435058594, + "epoch": 0.27439024390243905, + "grad_norm": 0.1563553594858076, + "kl": 0.0599365234375, + "learning_rate": 2.730556525211368e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1800 + }, + { + "completion_length": 1038.3333435058594, + "epoch": 0.27454268292682926, + "grad_norm": 0.1067478187696796, + "kl": 0.065185546875, + "learning_rate": 2.730099932830225e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1801 + }, + { + "completion_length": 1655.166748046875, + "epoch": 0.27469512195121953, + "grad_norm": 0.08941043743679554, + "kl": 0.0535888671875, + "learning_rate": 2.7296429921539273e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1802 + }, + { + "completion_length": 2003.166748046875, + "epoch": 0.27484756097560975, + "grad_norm": 0.13902116087048252, + "kl": 0.049072265625, + "learning_rate": 2.729185703311855e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1803 + }, + { + "completion_length": 712.3333435058594, + "epoch": 0.275, + "grad_norm": 0.10870588345075725, + "kl": 0.0498046875, + "learning_rate": 2.728728066433488e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1804 + }, + { + "completion_length": 1983.666748046875, + "epoch": 0.27515243902439024, + "grad_norm": 0.09092388321890912, + "kl": 0.0469970703125, + "learning_rate": 2.728270081648401e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1805 + }, + { + "completion_length": 959.6666870117188, + "epoch": 0.2753048780487805, + "grad_norm": 0.10826070771851462, + "kl": 0.0604248046875, + "learning_rate": 2.7278117490862713e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1806 + }, + { + "completion_length": 948.6666870117188, + "epoch": 0.2754573170731707, + "grad_norm": 0.21672089036360898, + "kl": 0.0692138671875, + "learning_rate": 2.7273530688768726e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1807 + }, + { + "completion_length": 1595.5, + "epoch": 0.275609756097561, + "grad_norm": 0.11736622850213141, + "kl": 0.0408935546875, + "learning_rate": 2.726894041150077e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1808 + }, + { + "completion_length": 1780.666748046875, + "epoch": 0.2757621951219512, + "grad_norm": 0.0805000810101791, + "kl": 0.051513671875, + "learning_rate": 2.726434666035855e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1809 + }, + { + "completion_length": 2604.1666870117188, + "epoch": 0.2759146341463415, + "grad_norm": 0.11396086549274667, + "kl": 0.05517578125, + "learning_rate": 2.725974943664276e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1810 + }, + { + "completion_length": 1416.3333740234375, + "epoch": 0.2760670731707317, + "grad_norm": 0.0756218594316815, + "kl": 0.03955078125, + "learning_rate": 2.725514874165508e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1811 + }, + { + "completion_length": 2330.8334350585938, + "epoch": 0.276219512195122, + "grad_norm": 0.31519841026430634, + "kl": 0.06103515625, + "learning_rate": 2.7250544576698174e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1812 + }, + { + "completion_length": 1083.6666870117188, + "epoch": 0.2763719512195122, + "grad_norm": 0.1383419530298287, + "kl": 0.076904296875, + "learning_rate": 2.724593694307567e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1813 + }, + { + "completion_length": 3016.6666870117188, + "epoch": 0.27652439024390246, + "grad_norm": 0.09995106761194071, + "kl": 0.053955078125, + "learning_rate": 2.724132584209219e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1814 + }, + { + "completion_length": 1766.6667175292969, + "epoch": 0.2766768292682927, + "grad_norm": 0.10541777008107683, + "kl": 0.0465087890625, + "learning_rate": 2.7236711275053346e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1815 + }, + { + "completion_length": 2290.0000610351562, + "epoch": 0.27682926829268295, + "grad_norm": 0.06933618897779065, + "kl": 0.052978515625, + "learning_rate": 2.7232093243265727e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1816 + }, + { + "completion_length": 1511.5, + "epoch": 0.27698170731707317, + "grad_norm": 0.10659824974935293, + "kl": 0.0545654296875, + "learning_rate": 2.7227471748036894e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1817 + }, + { + "completion_length": 1147.1667175292969, + "epoch": 0.27713414634146344, + "grad_norm": 0.1309323323776677, + "kl": 0.0657958984375, + "learning_rate": 2.722284679067539e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1818 + }, + { + "completion_length": 1014.3333435058594, + "epoch": 0.27728658536585366, + "grad_norm": 0.10089954985960463, + "kl": 0.065185546875, + "learning_rate": 2.721821837249076e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1819 + }, + { + "completion_length": 1798.0000610351562, + "epoch": 0.2774390243902439, + "grad_norm": 0.12287700598333261, + "kl": 0.0638427734375, + "learning_rate": 2.7213586494793492e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1820 + }, + { + "completion_length": 875.1666870117188, + "epoch": 0.27759146341463414, + "grad_norm": 0.1610243605631769, + "kl": 0.070068359375, + "learning_rate": 2.720895115889508e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1821 + }, + { + "completion_length": 1690.3334350585938, + "epoch": 0.2777439024390244, + "grad_norm": 0.08041338500773725, + "kl": 0.0433349609375, + "learning_rate": 2.7204312366108e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1822 + }, + { + "completion_length": 832.5000305175781, + "epoch": 0.27789634146341463, + "grad_norm": 0.10754798009453335, + "kl": 0.030029296875, + "learning_rate": 2.7199670117745685e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1823 + }, + { + "completion_length": 1094.3333435058594, + "epoch": 0.2780487804878049, + "grad_norm": 0.16871188579411892, + "kl": 0.06884765625, + "learning_rate": 2.7195024415122565e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1824 + }, + { + "completion_length": 878.5000305175781, + "epoch": 0.2782012195121951, + "grad_norm": 0.08011579451863175, + "kl": 0.0543212890625, + "learning_rate": 2.7190375259554042e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1825 + }, + { + "completion_length": 1355.6666870117188, + "epoch": 0.2783536585365854, + "grad_norm": 0.13210000464975638, + "kl": 0.0556640625, + "learning_rate": 2.7185722652356494e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1826 + }, + { + "completion_length": 1120.5, + "epoch": 0.2785060975609756, + "grad_norm": 0.11447038625163163, + "kl": 0.0472412109375, + "learning_rate": 2.7181066594847273e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1827 + }, + { + "completion_length": 1142.5, + "epoch": 0.2786585365853659, + "grad_norm": 0.1268066398585819, + "kl": 0.0494384765625, + "learning_rate": 2.7176407088344726e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1828 + }, + { + "completion_length": 2253.1666870117188, + "epoch": 0.2788109756097561, + "grad_norm": 0.057135751552685514, + "kl": 0.0413818359375, + "learning_rate": 2.717174413416815e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1829 + }, + { + "completion_length": 822.6666870117188, + "epoch": 0.27896341463414637, + "grad_norm": 0.10721952430340886, + "kl": 0.0562744140625, + "learning_rate": 2.716707773363783e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1830 + }, + { + "completion_length": 761.5000305175781, + "epoch": 0.2791158536585366, + "grad_norm": 0.18564079952826062, + "kl": 0.07470703125, + "learning_rate": 2.7162407888075045e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1831 + }, + { + "completion_length": 1641.8333435058594, + "epoch": 0.27926829268292686, + "grad_norm": 0.088176120437309, + "kl": 0.04541015625, + "learning_rate": 2.715773459880202e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1832 + }, + { + "completion_length": 2203.8333740234375, + "epoch": 0.27942073170731707, + "grad_norm": 1.6244548709473856, + "kl": 0.0511474609375, + "learning_rate": 2.715305786714197e-06, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1833 + }, + { + "completion_length": 984.3333740234375, + "epoch": 0.27957317073170734, + "grad_norm": 1.97116920197145, + "kl": 0.0472412109375, + "learning_rate": 2.714837769441908e-06, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1834 + }, + { + "completion_length": 1463.666748046875, + "epoch": 0.27972560975609756, + "grad_norm": 0.07867885040119783, + "kl": 0.0560302734375, + "learning_rate": 2.714369408195852e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1835 + }, + { + "completion_length": 1143.0000305175781, + "epoch": 0.27987804878048783, + "grad_norm": 0.12136696414067763, + "kl": 0.04345703125, + "learning_rate": 2.7139007031086414e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1836 + }, + { + "completion_length": 1927.3334350585938, + "epoch": 0.28003048780487805, + "grad_norm": 0.14336858388322662, + "kl": 0.0531005859375, + "learning_rate": 2.7134316543129884e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1837 + }, + { + "completion_length": 2157.3333740234375, + "epoch": 0.2801829268292683, + "grad_norm": 0.08787608098527228, + "kl": 0.0498046875, + "learning_rate": 2.7129622619417006e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1838 + }, + { + "completion_length": 1026.1666870117188, + "epoch": 0.28033536585365854, + "grad_norm": 0.16267906370202673, + "kl": 0.0755615234375, + "learning_rate": 2.7124925261276835e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1839 + }, + { + "completion_length": 740.3333435058594, + "epoch": 0.2804878048780488, + "grad_norm": 0.12813864480810683, + "kl": 0.0601806640625, + "learning_rate": 2.7120224470039394e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1840 + }, + { + "completion_length": 715.1666870117188, + "epoch": 0.280640243902439, + "grad_norm": 0.14539495268795835, + "kl": 0.0645751953125, + "learning_rate": 2.7115520247035697e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1841 + }, + { + "completion_length": 1066.6666870117188, + "epoch": 0.2807926829268293, + "grad_norm": 0.10617399284860253, + "kl": 0.0419921875, + "learning_rate": 2.71108125935977e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1842 + }, + { + "completion_length": 922.6666870117188, + "epoch": 0.2809451219512195, + "grad_norm": 0.1261825708468763, + "kl": 0.0570068359375, + "learning_rate": 2.7106101511058355e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1843 + }, + { + "completion_length": 597.8333587646484, + "epoch": 0.2810975609756098, + "grad_norm": 3.3752643494963364, + "kl": 0.093017578125, + "learning_rate": 2.710138700075157e-06, + "loss": 0.0037, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1844 + }, + { + "completion_length": 1118.5000610351562, + "epoch": 0.28125, + "grad_norm": 0.11874866884953311, + "kl": 0.070556640625, + "learning_rate": 2.709666906401224e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1845 + }, + { + "completion_length": 932.6666870117188, + "epoch": 0.2814024390243902, + "grad_norm": 0.10433306128732317, + "kl": 0.0496826171875, + "learning_rate": 2.709194770217621e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1846 + }, + { + "completion_length": 1008.8333740234375, + "epoch": 0.2815548780487805, + "grad_norm": 0.24018849453412208, + "kl": 0.0570068359375, + "learning_rate": 2.7087222916580303e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1847 + }, + { + "completion_length": 2216.666748046875, + "epoch": 0.2817073170731707, + "grad_norm": 0.08952719521890712, + "kl": 0.057373046875, + "learning_rate": 2.7082494708562316e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1848 + }, + { + "completion_length": 2563.5001220703125, + "epoch": 0.281859756097561, + "grad_norm": 0.10423670554841205, + "kl": 0.0667724609375, + "learning_rate": 2.7077763079461014e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1849 + }, + { + "completion_length": 746.1666870117188, + "epoch": 0.2820121951219512, + "grad_norm": 0.3678307789284281, + "kl": 0.083251953125, + "learning_rate": 2.7073028030616128e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1850 + }, + { + "completion_length": 1214.1666870117188, + "epoch": 0.28216463414634146, + "grad_norm": 1.832857971769399, + "kl": 0.0709228515625, + "learning_rate": 2.7068289563368354e-06, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1851 + }, + { + "completion_length": 1438.6667175292969, + "epoch": 0.2823170731707317, + "grad_norm": 0.1443967227962915, + "kl": 0.0712890625, + "learning_rate": 2.706354767905936e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1852 + }, + { + "completion_length": 1133.3333740234375, + "epoch": 0.28246951219512195, + "grad_norm": 0.1286367454364189, + "kl": 0.06640625, + "learning_rate": 2.705880237903179e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1853 + }, + { + "completion_length": 1672.1666870117188, + "epoch": 0.28262195121951217, + "grad_norm": 1.7132459137208593, + "kl": 0.076171875, + "learning_rate": 2.7054053664629227e-06, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1854 + }, + { + "completion_length": 2523.83349609375, + "epoch": 0.28277439024390244, + "grad_norm": 0.09316943976952285, + "kl": 0.062744140625, + "learning_rate": 2.7049301537196258e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1855 + }, + { + "completion_length": 1051.3333435058594, + "epoch": 0.28292682926829266, + "grad_norm": 0.16961330795831675, + "kl": 0.084716796875, + "learning_rate": 2.7044545998078414e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1856 + }, + { + "completion_length": 912.8333435058594, + "epoch": 0.28307926829268293, + "grad_norm": 0.10506354363066281, + "kl": 0.07275390625, + "learning_rate": 2.7039787048622195e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1857 + }, + { + "completion_length": 1839.5000915527344, + "epoch": 0.28323170731707314, + "grad_norm": 0.11143586608169867, + "kl": 0.0489501953125, + "learning_rate": 2.703502469017506e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1858 + }, + { + "completion_length": 3531.0, + "epoch": 0.2833841463414634, + "grad_norm": 0.0684648284849461, + "kl": 0.0545654296875, + "learning_rate": 2.7030258924085457e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1859 + }, + { + "completion_length": 1274.8333435058594, + "epoch": 0.28353658536585363, + "grad_norm": 0.13159891262983162, + "kl": 0.0799560546875, + "learning_rate": 2.702548975170277e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1860 + }, + { + "completion_length": 517.3333435058594, + "epoch": 0.2836890243902439, + "grad_norm": 2.610309007006615, + "kl": 0.14697265625, + "learning_rate": 2.7020717174377366e-06, + "loss": 0.0059, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1861 + }, + { + "completion_length": 1802.6666870117188, + "epoch": 0.2838414634146341, + "grad_norm": 2.376331780932242, + "kl": 0.086181640625, + "learning_rate": 2.701594119346057e-06, + "loss": 0.0034, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 1862 + }, + { + "completion_length": 1884.5, + "epoch": 0.2839939024390244, + "grad_norm": 0.11225525186622726, + "kl": 0.0692138671875, + "learning_rate": 2.7011161810304674e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1863 + }, + { + "completion_length": 1899.6667175292969, + "epoch": 0.2841463414634146, + "grad_norm": 0.154420991658519, + "kl": 0.06787109375, + "learning_rate": 2.7006379026262924e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1864 + }, + { + "completion_length": 2097.0, + "epoch": 0.2842987804878049, + "grad_norm": 0.09355740215872413, + "kl": 0.0537109375, + "learning_rate": 2.7001592842689542e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1865 + }, + { + "completion_length": 1333.0000610351562, + "epoch": 0.2844512195121951, + "grad_norm": 0.2803632001193625, + "kl": 0.094970703125, + "learning_rate": 2.699680326093971e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1866 + }, + { + "completion_length": 1832.6666870117188, + "epoch": 0.28460365853658537, + "grad_norm": 0.21127328768172396, + "kl": 0.09619140625, + "learning_rate": 2.699201028236955e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1867 + }, + { + "completion_length": 718.3333435058594, + "epoch": 0.2847560975609756, + "grad_norm": 0.17062572784635577, + "kl": 0.094482421875, + "learning_rate": 2.6987213908336185e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1868 + }, + { + "completion_length": 558.8333740234375, + "epoch": 0.28490853658536586, + "grad_norm": 0.16259813397361278, + "kl": 0.0830078125, + "learning_rate": 2.6982414140197663e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1869 + }, + { + "completion_length": 2218.3333435058594, + "epoch": 0.2850609756097561, + "grad_norm": 0.14141075747038845, + "kl": 0.08984375, + "learning_rate": 2.6977610979313018e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1870 + }, + { + "completion_length": 2842.83349609375, + "epoch": 0.28521341463414634, + "grad_norm": 0.13757937834408387, + "kl": 0.0802001953125, + "learning_rate": 2.697280442704223e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1871 + }, + { + "completion_length": 2932.0001220703125, + "epoch": 0.28536585365853656, + "grad_norm": 0.1471548174744847, + "kl": 0.0947265625, + "learning_rate": 2.696799448474625e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1872 + }, + { + "completion_length": 807.8333740234375, + "epoch": 0.28551829268292683, + "grad_norm": 0.3133464375969027, + "kl": 0.082763671875, + "learning_rate": 2.696318115378698e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1873 + }, + { + "completion_length": 2812.666748046875, + "epoch": 0.28567073170731705, + "grad_norm": 0.4972771387631647, + "kl": 0.083740234375, + "learning_rate": 2.6958364435527286e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1874 + }, + { + "completion_length": 2662.3333740234375, + "epoch": 0.2858231707317073, + "grad_norm": 0.08161436733025283, + "kl": 0.07568359375, + "learning_rate": 2.6953544331330988e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1875 + }, + { + "completion_length": 1150.5, + "epoch": 0.28597560975609754, + "grad_norm": 0.263436346855398, + "kl": 0.138427734375, + "learning_rate": 2.694872084256287e-06, + "loss": 0.0055, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1876 + }, + { + "completion_length": 2357.3333740234375, + "epoch": 0.2861280487804878, + "grad_norm": 2.2103096029361713, + "kl": 0.1015625, + "learning_rate": 2.6943893970588675e-06, + "loss": 0.0041, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1877 + }, + { + "completion_length": 2152.5000610351562, + "epoch": 0.286280487804878, + "grad_norm": 0.14942652194467684, + "kl": 0.0855712890625, + "learning_rate": 2.69390637167751e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1878 + }, + { + "completion_length": 2733.8333740234375, + "epoch": 0.2864329268292683, + "grad_norm": 0.14613983756693202, + "kl": 0.0609130859375, + "learning_rate": 2.6934230082489806e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1879 + }, + { + "completion_length": 1513.5000610351562, + "epoch": 0.2865853658536585, + "grad_norm": 0.17787414208609648, + "kl": 0.093994140625, + "learning_rate": 2.69293930691014e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1880 + }, + { + "completion_length": 1357.3333740234375, + "epoch": 0.2867378048780488, + "grad_norm": 0.0985601343445324, + "kl": 0.072998046875, + "learning_rate": 2.6924552677979454e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1881 + }, + { + "completion_length": 1438.6667175292969, + "epoch": 0.286890243902439, + "grad_norm": 0.11471065433329433, + "kl": 0.080810546875, + "learning_rate": 2.6919708910494496e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1882 + }, + { + "completion_length": 1270.0, + "epoch": 0.2870426829268293, + "grad_norm": 0.11724582611091879, + "kl": 0.065185546875, + "learning_rate": 2.6914861768018008e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1883 + }, + { + "completion_length": 1468.166748046875, + "epoch": 0.2871951219512195, + "grad_norm": 0.12530158549296153, + "kl": 0.0643310546875, + "learning_rate": 2.691001125192243e-06, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 1884 + }, + { + "completion_length": 1961.1666870117188, + "epoch": 0.28734756097560976, + "grad_norm": 0.07614995934532616, + "kl": 0.056640625, + "learning_rate": 2.6905157363581156e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1885 + }, + { + "completion_length": 2284.166748046875, + "epoch": 0.2875, + "grad_norm": 0.07404324787200497, + "kl": 0.04638671875, + "learning_rate": 2.690030010436853e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1886 + }, + { + "completion_length": 906.8333740234375, + "epoch": 0.28765243902439025, + "grad_norm": 0.20180982081662152, + "kl": 0.109130859375, + "learning_rate": 2.689543947565986e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1887 + }, + { + "completion_length": 2822.8333740234375, + "epoch": 0.28780487804878047, + "grad_norm": 0.10940651719615577, + "kl": 0.0584716796875, + "learning_rate": 2.689057547883139e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1888 + }, + { + "completion_length": 1897.3334350585938, + "epoch": 0.28795731707317074, + "grad_norm": 0.10082422947707777, + "kl": 0.0673828125, + "learning_rate": 2.688570811526035e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1889 + }, + { + "completion_length": 2376.5, + "epoch": 0.28810975609756095, + "grad_norm": 0.08268793100308564, + "kl": 0.063232421875, + "learning_rate": 2.688083738632489e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1890 + }, + { + "completion_length": 1351.3333740234375, + "epoch": 0.2882621951219512, + "grad_norm": 0.22381221399506887, + "kl": 0.0810546875, + "learning_rate": 2.6875963293404137e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1891 + }, + { + "completion_length": 1057.3333740234375, + "epoch": 0.28841463414634144, + "grad_norm": 0.12373724881335137, + "kl": 0.07666015625, + "learning_rate": 2.687108583787815e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1892 + }, + { + "completion_length": 1895.8334350585938, + "epoch": 0.2885670731707317, + "grad_norm": 0.17076907237007644, + "kl": 0.083740234375, + "learning_rate": 2.6866205021127954e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1893 + }, + { + "completion_length": 1682.3334197998047, + "epoch": 0.28871951219512193, + "grad_norm": 0.48398426029893765, + "kl": 0.100830078125, + "learning_rate": 2.6861320844535526e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1894 + }, + { + "completion_length": 1217.8333740234375, + "epoch": 0.2888719512195122, + "grad_norm": 0.1327555458374685, + "kl": 0.065673828125, + "learning_rate": 2.685643330948378e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1895 + }, + { + "completion_length": 1400.6666870117188, + "epoch": 0.2890243902439024, + "grad_norm": 0.10308885844379716, + "kl": 0.0704345703125, + "learning_rate": 2.6851542417356605e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1896 + }, + { + "completion_length": 1858.8333740234375, + "epoch": 0.2891768292682927, + "grad_norm": 0.11622425712289039, + "kl": 0.082275390625, + "learning_rate": 2.6846648169538817e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1897 + }, + { + "completion_length": 1534.8333740234375, + "epoch": 0.2893292682926829, + "grad_norm": 0.12195582340007485, + "kl": 0.07958984375, + "learning_rate": 2.684175056741619e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1898 + }, + { + "completion_length": 1973.0, + "epoch": 0.2894817073170732, + "grad_norm": 0.10331171744262914, + "kl": 0.06103515625, + "learning_rate": 2.683684961237546e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1899 + }, + { + "completion_length": 1901.1666870117188, + "epoch": 0.2896341463414634, + "grad_norm": 0.39186998239007076, + "kl": 0.09130859375, + "learning_rate": 2.683194530580429e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1900 + }, + { + "completion_length": 1644.8333740234375, + "epoch": 0.28978658536585367, + "grad_norm": 0.15987111557816494, + "kl": 0.0670166015625, + "learning_rate": 2.6827037649091313e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1901 + }, + { + "completion_length": 947.0, + "epoch": 0.2899390243902439, + "grad_norm": 2.031540518383375, + "kl": 0.095947265625, + "learning_rate": 2.68221266436261e-06, + "loss": 0.0038, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1902 + }, + { + "completion_length": 1008.0000305175781, + "epoch": 0.29009146341463415, + "grad_norm": 0.10026860873283146, + "kl": 0.0806884765625, + "learning_rate": 2.6817212290799166e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1903 + }, + { + "completion_length": 1130.5, + "epoch": 0.29024390243902437, + "grad_norm": 0.0966628351163042, + "kl": 0.0587158203125, + "learning_rate": 2.6812294592001984e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1904 + }, + { + "completion_length": 1385.6666870117188, + "epoch": 0.29039634146341464, + "grad_norm": 0.12982850226726292, + "kl": 0.0712890625, + "learning_rate": 2.680737354862697e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1905 + }, + { + "completion_length": 1817.8333740234375, + "epoch": 0.29054878048780486, + "grad_norm": 0.11934999366708489, + "kl": 0.04632568359375, + "learning_rate": 2.680244916206749e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1906 + }, + { + "completion_length": 2192.0000915527344, + "epoch": 0.29070121951219513, + "grad_norm": 0.11148962679554524, + "kl": 0.044921875, + "learning_rate": 2.6797521433717842e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1907 + }, + { + "completion_length": 845.0, + "epoch": 0.29085365853658535, + "grad_norm": 1.8931827830328052, + "kl": 0.07958984375, + "learning_rate": 2.67925903649733e-06, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1908 + }, + { + "completion_length": 1480.8333740234375, + "epoch": 0.2910060975609756, + "grad_norm": 0.09612447599301742, + "kl": 0.064453125, + "learning_rate": 2.6787655957230048e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1909 + }, + { + "completion_length": 3252.166748046875, + "epoch": 0.29115853658536583, + "grad_norm": 0.07313638856077898, + "kl": 0.056640625, + "learning_rate": 2.678271821188524e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1910 + }, + { + "completion_length": 2451.3333740234375, + "epoch": 0.2913109756097561, + "grad_norm": 0.09201467673602182, + "kl": 0.050048828125, + "learning_rate": 2.6777777130336966e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1911 + }, + { + "completion_length": 2175.1666870117188, + "epoch": 0.2914634146341463, + "grad_norm": 0.09990183016086004, + "kl": 0.0443115234375, + "learning_rate": 2.677283271398427e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1912 + }, + { + "completion_length": 2281.6666870117188, + "epoch": 0.2916158536585366, + "grad_norm": 0.09594132812167377, + "kl": 0.047607421875, + "learning_rate": 2.6767884964227123e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1913 + }, + { + "completion_length": 2184.0001220703125, + "epoch": 0.2917682926829268, + "grad_norm": 0.12500692554471446, + "kl": 0.0526123046875, + "learning_rate": 2.6762933882466452e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1914 + }, + { + "completion_length": 2685.5, + "epoch": 0.2919207317073171, + "grad_norm": 0.14675643670240568, + "kl": 0.0548095703125, + "learning_rate": 2.6757979470104127e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1915 + }, + { + "completion_length": 2517.5001220703125, + "epoch": 0.2920731707317073, + "grad_norm": 0.07772997081817506, + "kl": 0.042724609375, + "learning_rate": 2.6753021728542965e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1916 + }, + { + "completion_length": 2234.0, + "epoch": 0.29222560975609757, + "grad_norm": 0.08545591748146812, + "kl": 0.046630859375, + "learning_rate": 2.6748060659186704e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1917 + }, + { + "completion_length": 1226.3333740234375, + "epoch": 0.2923780487804878, + "grad_norm": 0.10653093125989488, + "kl": 0.062255859375, + "learning_rate": 2.6743096263440054e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1918 + }, + { + "completion_length": 2415.5000610351562, + "epoch": 0.29253048780487806, + "grad_norm": 0.0854053221208336, + "kl": 0.054443359375, + "learning_rate": 2.673812854270865e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1919 + }, + { + "completion_length": 2381.5, + "epoch": 0.2926829268292683, + "grad_norm": 0.08771650356362849, + "kl": 0.055908203125, + "learning_rate": 2.673315749839907e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1920 + }, + { + "completion_length": 2567.3333740234375, + "epoch": 0.29283536585365855, + "grad_norm": 0.09780482643679707, + "kl": 0.0506591796875, + "learning_rate": 2.672818313191883e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1921 + }, + { + "completion_length": 3551.166748046875, + "epoch": 0.29298780487804876, + "grad_norm": 0.039519080505118945, + "kl": 0.0399169921875, + "learning_rate": 2.6723205444676395e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1922 + }, + { + "completion_length": 1670.8333740234375, + "epoch": 0.29314024390243903, + "grad_norm": 0.10290406089703591, + "kl": 0.06005859375, + "learning_rate": 2.671822443808117e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1923 + }, + { + "completion_length": 2231.8333740234375, + "epoch": 0.29329268292682925, + "grad_norm": 2.08220182535506, + "kl": 0.0677490234375, + "learning_rate": 2.6713240113543487e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1924 + }, + { + "completion_length": 2260.666748046875, + "epoch": 0.2934451219512195, + "grad_norm": 0.10964882549684743, + "kl": 0.078125, + "learning_rate": 2.6708252472474638e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1925 + }, + { + "completion_length": 1097.8333740234375, + "epoch": 0.29359756097560974, + "grad_norm": 0.11947545001589337, + "kl": 0.0579833984375, + "learning_rate": 2.670326151628683e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1926 + }, + { + "completion_length": 2251.8334350585938, + "epoch": 0.29375, + "grad_norm": 0.0988072097071032, + "kl": 0.060546875, + "learning_rate": 2.6698267246393225e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1927 + }, + { + "completion_length": 2326.0, + "epoch": 0.2939024390243902, + "grad_norm": 0.24072377601383452, + "kl": 0.067138671875, + "learning_rate": 2.669326966420793e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1928 + }, + { + "completion_length": 651.6666870117188, + "epoch": 0.2940548780487805, + "grad_norm": 0.2322375906075257, + "kl": 0.08642578125, + "learning_rate": 2.6688268771145965e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1929 + }, + { + "completion_length": 2035.666748046875, + "epoch": 0.2942073170731707, + "grad_norm": 0.09293250483832889, + "kl": 0.06494140625, + "learning_rate": 2.6683264568623314e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1930 + }, + { + "completion_length": 1531.6666870117188, + "epoch": 0.294359756097561, + "grad_norm": 0.1488431779810804, + "kl": 0.072998046875, + "learning_rate": 2.6678257058056872e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1931 + }, + { + "completion_length": 1813.0000610351562, + "epoch": 0.2945121951219512, + "grad_norm": 0.11489323908415744, + "kl": 0.076904296875, + "learning_rate": 2.66732462408645e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1932 + }, + { + "completion_length": 870.0, + "epoch": 0.2946646341463415, + "grad_norm": 0.266887742402624, + "kl": 0.099365234375, + "learning_rate": 2.666823211846497e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1933 + }, + { + "completion_length": 1896.5000610351562, + "epoch": 0.2948170731707317, + "grad_norm": 0.1179837336706581, + "kl": 0.092041015625, + "learning_rate": 2.6663214692278002e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1934 + }, + { + "completion_length": 662.8333435058594, + "epoch": 0.29496951219512196, + "grad_norm": 0.18136337407937814, + "kl": 0.078369140625, + "learning_rate": 2.6658193963724243e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1935 + }, + { + "completion_length": 1267.0, + "epoch": 0.2951219512195122, + "grad_norm": 0.12448578170103489, + "kl": 0.074951171875, + "learning_rate": 2.6653169934225295e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1936 + }, + { + "completion_length": 1959.166748046875, + "epoch": 0.29527439024390245, + "grad_norm": 0.0754140638243036, + "kl": 0.06494140625, + "learning_rate": 2.664814260520367e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1937 + }, + { + "completion_length": 2459.5, + "epoch": 0.29542682926829267, + "grad_norm": 0.07158947158604012, + "kl": 0.0487060546875, + "learning_rate": 2.664311197808283e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1938 + }, + { + "completion_length": 1717.166748046875, + "epoch": 0.29557926829268294, + "grad_norm": 0.40667285750736126, + "kl": 0.087646484375, + "learning_rate": 2.6638078054287156e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1939 + }, + { + "completion_length": 1196.8333435058594, + "epoch": 0.29573170731707316, + "grad_norm": 0.23767852608615556, + "kl": 0.089599609375, + "learning_rate": 2.6633040835241987e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1940 + }, + { + "completion_length": 1272.3333435058594, + "epoch": 0.2958841463414634, + "grad_norm": 0.26618548091888444, + "kl": 0.081298828125, + "learning_rate": 2.6628000322373567e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1941 + }, + { + "completion_length": 2936.1666870117188, + "epoch": 0.29603658536585364, + "grad_norm": 0.1162163295244868, + "kl": 0.0621337890625, + "learning_rate": 2.66229565171091e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1942 + }, + { + "completion_length": 995.8333435058594, + "epoch": 0.2961890243902439, + "grad_norm": 0.14295489334506492, + "kl": 0.087890625, + "learning_rate": 2.6617909420876692e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1943 + }, + { + "completion_length": 1939.666748046875, + "epoch": 0.29634146341463413, + "grad_norm": 0.1767040104885622, + "kl": 0.0570068359375, + "learning_rate": 2.661285903510541e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1944 + }, + { + "completion_length": 705.3333740234375, + "epoch": 0.2964939024390244, + "grad_norm": 0.12655744266136715, + "kl": 0.07763671875, + "learning_rate": 2.660780536122523e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1945 + }, + { + "completion_length": 1948.0, + "epoch": 0.2966463414634146, + "grad_norm": 0.10574167538138984, + "kl": 0.053466796875, + "learning_rate": 2.660274840066707e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1946 + }, + { + "completion_length": 1079.3333435058594, + "epoch": 0.2967987804878049, + "grad_norm": 0.1794193401051039, + "kl": 0.077392578125, + "learning_rate": 2.659768815486279e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1947 + }, + { + "completion_length": 2053.5000610351562, + "epoch": 0.2969512195121951, + "grad_norm": 0.11135585969448353, + "kl": 0.0633544921875, + "learning_rate": 2.659262462524515e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1948 + }, + { + "completion_length": 3057.1666870117188, + "epoch": 0.2971036585365854, + "grad_norm": 0.05553198209903559, + "kl": 0.0408935546875, + "learning_rate": 2.658755781324787e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1949 + }, + { + "completion_length": 1265.5, + "epoch": 0.2972560975609756, + "grad_norm": 0.10385035333742193, + "kl": 0.0638427734375, + "learning_rate": 2.6582487720305573e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1950 + }, + { + "completion_length": 2002.666748046875, + "epoch": 0.29740853658536587, + "grad_norm": 0.0927144308690017, + "kl": 0.081298828125, + "learning_rate": 2.657741434785383e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1951 + }, + { + "completion_length": 1477.6667175292969, + "epoch": 0.2975609756097561, + "grad_norm": 0.1705131116037035, + "kl": 0.07666015625, + "learning_rate": 2.6572337697329145e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1952 + }, + { + "completion_length": 3356.166748046875, + "epoch": 0.29771341463414636, + "grad_norm": 0.06065869128604714, + "kl": 0.0374755859375, + "learning_rate": 2.656725777016893e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1953 + }, + { + "completion_length": 1349.8333740234375, + "epoch": 0.2978658536585366, + "grad_norm": 0.23070367490365667, + "kl": 0.0582275390625, + "learning_rate": 2.656217456781153e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1954 + }, + { + "completion_length": 1032.6667175292969, + "epoch": 0.29801829268292684, + "grad_norm": 0.07847217702039859, + "kl": 0.034912109375, + "learning_rate": 2.655708809169623e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1955 + }, + { + "completion_length": 951.5000305175781, + "epoch": 0.29817073170731706, + "grad_norm": 0.21139072635090042, + "kl": 0.0826416015625, + "learning_rate": 2.6551998343263237e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1956 + }, + { + "completion_length": 638.6666870117188, + "epoch": 0.29832317073170733, + "grad_norm": 0.1658776601074471, + "kl": 0.10009765625, + "learning_rate": 2.654690532395367e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1957 + }, + { + "completion_length": 936.5000610351562, + "epoch": 0.29847560975609755, + "grad_norm": 0.09246353041344949, + "kl": 0.0528564453125, + "learning_rate": 2.65418090352096e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1958 + }, + { + "completion_length": 2372.3334350585938, + "epoch": 0.2986280487804878, + "grad_norm": 0.07560572378924865, + "kl": 0.0421142578125, + "learning_rate": 2.6536709478474e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1959 + }, + { + "completion_length": 1954.166748046875, + "epoch": 0.29878048780487804, + "grad_norm": 0.1446382041064352, + "kl": 0.069091796875, + "learning_rate": 2.6531606655190777e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1960 + }, + { + "completion_length": 1459.5000610351562, + "epoch": 0.2989329268292683, + "grad_norm": 0.16504117923528788, + "kl": 0.0587158203125, + "learning_rate": 2.652650056680477e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1961 + }, + { + "completion_length": 522.8333435058594, + "epoch": 0.2990853658536585, + "grad_norm": 2.7274959375210552, + "kl": 0.072998046875, + "learning_rate": 2.6521391214761735e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 1962 + }, + { + "completion_length": 971.8333740234375, + "epoch": 0.2992378048780488, + "grad_norm": 0.13961538669357854, + "kl": 0.0791015625, + "learning_rate": 2.6516278600508347e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1963 + }, + { + "completion_length": 1290.8333740234375, + "epoch": 0.299390243902439, + "grad_norm": 0.10018310042126602, + "kl": 0.0684814453125, + "learning_rate": 2.651116272549222e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1964 + }, + { + "completion_length": 2223.5, + "epoch": 0.2995426829268293, + "grad_norm": 0.08236981876864231, + "kl": 0.047119140625, + "learning_rate": 2.6506043591161874e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1965 + }, + { + "completion_length": 2339.666748046875, + "epoch": 0.2996951219512195, + "grad_norm": 0.0900103171677781, + "kl": 0.04705810546875, + "learning_rate": 2.650092119896677e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1966 + }, + { + "completion_length": 1554.666748046875, + "epoch": 0.2998475609756098, + "grad_norm": 0.1344676764726864, + "kl": 0.065185546875, + "learning_rate": 2.6495795550357266e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1967 + }, + { + "completion_length": 2468.166748046875, + "epoch": 0.3, + "grad_norm": 0.10025113701151087, + "kl": 0.065185546875, + "learning_rate": 2.649066664678467e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1968 + }, + { + "completion_length": 2322.8333740234375, + "epoch": 0.30015243902439026, + "grad_norm": 0.060941759679206085, + "kl": 0.0400390625, + "learning_rate": 2.64855344897012e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1969 + }, + { + "completion_length": 3053.166748046875, + "epoch": 0.3003048780487805, + "grad_norm": 0.03604558592253046, + "kl": 0.031494140625, + "learning_rate": 2.648039908055999e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1970 + }, + { + "completion_length": 2368.3333740234375, + "epoch": 0.30045731707317075, + "grad_norm": 0.1417346810844284, + "kl": 0.060546875, + "learning_rate": 2.6475260420815094e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1971 + }, + { + "completion_length": 1420.8333435058594, + "epoch": 0.30060975609756097, + "grad_norm": 0.13961661692199964, + "kl": 0.071533203125, + "learning_rate": 2.64701185119215e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1972 + }, + { + "completion_length": 1348.0, + "epoch": 0.30076219512195124, + "grad_norm": 0.1097987173334324, + "kl": 0.04931640625, + "learning_rate": 2.6464973355335102e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1973 + }, + { + "completion_length": 3063.0001220703125, + "epoch": 0.30091463414634145, + "grad_norm": 0.08719515335337256, + "kl": 0.0533447265625, + "learning_rate": 2.6459824952512726e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1974 + }, + { + "completion_length": 2741.0, + "epoch": 0.3010670731707317, + "grad_norm": 0.05901132165306076, + "kl": 0.029296875, + "learning_rate": 2.645467330491211e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1975 + }, + { + "completion_length": 4016.166748046875, + "epoch": 0.30121951219512194, + "grad_norm": 0.05612082820740935, + "kl": 0.03680419921875, + "learning_rate": 2.64495184139919e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1976 + }, + { + "completion_length": 2140.8334350585938, + "epoch": 0.3013719512195122, + "grad_norm": 0.08276001543366183, + "kl": 0.050537109375, + "learning_rate": 2.644436028121168e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1977 + }, + { + "completion_length": 1303.3333435058594, + "epoch": 0.30152439024390243, + "grad_norm": 0.13626434492707473, + "kl": 0.056640625, + "learning_rate": 2.643919890803194e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1978 + }, + { + "completion_length": 2067.166748046875, + "epoch": 0.3016768292682927, + "grad_norm": 0.07642705785141314, + "kl": 0.04248046875, + "learning_rate": 2.6434034295914094e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1979 + }, + { + "completion_length": 3139.666748046875, + "epoch": 0.3018292682926829, + "grad_norm": 0.09069336559170925, + "kl": 0.043212890625, + "learning_rate": 2.642886644632047e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1980 + }, + { + "completion_length": 2999.3333740234375, + "epoch": 0.3019817073170732, + "grad_norm": 0.26079671472461863, + "kl": 0.056884765625, + "learning_rate": 2.6423695360714315e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1981 + }, + { + "completion_length": 3364.3333740234375, + "epoch": 0.3021341463414634, + "grad_norm": 0.054873988188537025, + "kl": 0.04388427734375, + "learning_rate": 2.641852104055978e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1982 + }, + { + "completion_length": 3318.666748046875, + "epoch": 0.3022865853658537, + "grad_norm": 0.06576781182900146, + "kl": 0.051025390625, + "learning_rate": 2.641334348732195e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1983 + }, + { + "completion_length": 3852.166748046875, + "epoch": 0.3024390243902439, + "grad_norm": 0.04055911545634046, + "kl": 0.0433349609375, + "learning_rate": 2.640816270246681e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1984 + }, + { + "completion_length": 955.1666870117188, + "epoch": 0.30259146341463417, + "grad_norm": 0.13519287215023298, + "kl": 0.0537109375, + "learning_rate": 2.6402978687461274e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1985 + }, + { + "completion_length": 2325.5000610351562, + "epoch": 0.3027439024390244, + "grad_norm": 0.06155535038094588, + "kl": 0.047119140625, + "learning_rate": 2.6397791443773158e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1986 + }, + { + "completion_length": 3346.3333740234375, + "epoch": 0.30289634146341465, + "grad_norm": 0.09151758347804018, + "kl": 0.0440673828125, + "learning_rate": 2.6392600972871207e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1987 + }, + { + "completion_length": 1609.8333740234375, + "epoch": 0.30304878048780487, + "grad_norm": 0.08987026957844971, + "kl": 0.041015625, + "learning_rate": 2.6387407276225055e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1988 + }, + { + "completion_length": 713.1666870117188, + "epoch": 0.30320121951219514, + "grad_norm": 0.14075953170246294, + "kl": 0.059814453125, + "learning_rate": 2.6382210355305283e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1989 + }, + { + "completion_length": 1348.1666870117188, + "epoch": 0.30335365853658536, + "grad_norm": 0.1390089876738523, + "kl": 0.0543212890625, + "learning_rate": 2.6377010211583354e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1990 + }, + { + "completion_length": 3507.3333740234375, + "epoch": 0.30350609756097563, + "grad_norm": 0.03755740230200464, + "kl": 0.0379638671875, + "learning_rate": 2.6371806846531663e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1991 + }, + { + "completion_length": 2569.3333740234375, + "epoch": 0.30365853658536585, + "grad_norm": 0.5068096101984769, + "kl": 0.0771484375, + "learning_rate": 2.636660026162351e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1992 + }, + { + "completion_length": 2178.0001220703125, + "epoch": 0.3038109756097561, + "grad_norm": 0.09588306863854619, + "kl": 0.0482177734375, + "learning_rate": 2.6361390458333103e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1993 + }, + { + "completion_length": 907.0000305175781, + "epoch": 0.30396341463414633, + "grad_norm": 0.15426759563547673, + "kl": 0.0692138671875, + "learning_rate": 2.6356177438135564e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1994 + }, + { + "completion_length": 2023.166748046875, + "epoch": 0.3041158536585366, + "grad_norm": 0.0908588608732591, + "kl": 0.0665283203125, + "learning_rate": 2.6350961202506933e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1995 + }, + { + "completion_length": 900.0, + "epoch": 0.3042682926829268, + "grad_norm": 0.119073539161555, + "kl": 0.0438232421875, + "learning_rate": 2.6345741752924154e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1996 + }, + { + "completion_length": 2213.5000610351562, + "epoch": 0.3044207317073171, + "grad_norm": 0.12165109222789953, + "kl": 0.0771484375, + "learning_rate": 2.6340519090865077e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1997 + }, + { + "completion_length": 2581.0, + "epoch": 0.3045731707317073, + "grad_norm": 0.0645213291328366, + "kl": 0.0443115234375, + "learning_rate": 2.6335293217808473e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 1998 + }, + { + "completion_length": 1060.6666870117188, + "epoch": 0.3047256097560976, + "grad_norm": 0.12444322453242308, + "kl": 0.0755615234375, + "learning_rate": 2.633006413523401e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 1999 + }, + { + "completion_length": 1339.6666870117188, + "epoch": 0.3048780487804878, + "grad_norm": 0.07711774268482546, + "kl": 0.052490234375, + "learning_rate": 2.6324831844622278e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2000 + }, + { + "completion_length": 655.3333435058594, + "epoch": 0.30503048780487807, + "grad_norm": 0.12949515080132926, + "kl": 0.07080078125, + "learning_rate": 2.6319596347454757e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2001 + }, + { + "completion_length": 1108.6666870117188, + "epoch": 0.3051829268292683, + "grad_norm": 2.1719044004447756, + "kl": 0.0794677734375, + "learning_rate": 2.6314357645213853e-06, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2002 + }, + { + "completion_length": 1604.6666870117188, + "epoch": 0.30533536585365856, + "grad_norm": 0.0956317596509521, + "kl": 0.0509033203125, + "learning_rate": 2.6309115739382864e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2003 + }, + { + "completion_length": 2598.3333740234375, + "epoch": 0.3054878048780488, + "grad_norm": 1.5385406801331147, + "kl": 0.04718017578125, + "learning_rate": 2.6303870631446013e-06, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2004 + }, + { + "completion_length": 1704.6667175292969, + "epoch": 0.30564024390243905, + "grad_norm": 0.08343518532435051, + "kl": 0.05224609375, + "learning_rate": 2.629862232288842e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2005 + }, + { + "completion_length": 1621.5, + "epoch": 0.30579268292682926, + "grad_norm": 0.09979947690900003, + "kl": 0.0576171875, + "learning_rate": 2.62933708151961e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2006 + }, + { + "completion_length": 2556.5001220703125, + "epoch": 0.30594512195121953, + "grad_norm": 0.19604300867882615, + "kl": 0.03643798828125, + "learning_rate": 2.6288116109855988e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2007 + }, + { + "completion_length": 3431.33349609375, + "epoch": 0.30609756097560975, + "grad_norm": 0.0507493746979005, + "kl": 0.03125, + "learning_rate": 2.628285820835593e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2008 + }, + { + "completion_length": 3406.8333740234375, + "epoch": 0.30625, + "grad_norm": 0.06970957208079927, + "kl": 0.03125, + "learning_rate": 2.627759711218466e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2009 + }, + { + "completion_length": 2710.3333740234375, + "epoch": 0.30640243902439024, + "grad_norm": 0.08274032251185975, + "kl": 0.0478515625, + "learning_rate": 2.627233282283183e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2010 + }, + { + "completion_length": 4096.0, + "epoch": 0.3065548780487805, + "grad_norm": 0.040461902602616714, + "kl": 0.03070068359375, + "learning_rate": 2.6267065341787985e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2011 + }, + { + "completion_length": 2716.0, + "epoch": 0.3067073170731707, + "grad_norm": 0.11482051348252345, + "kl": 0.040771484375, + "learning_rate": 2.6261794670544584e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2012 + }, + { + "completion_length": 3237.666748046875, + "epoch": 0.306859756097561, + "grad_norm": 0.34987748594976265, + "kl": 0.0538330078125, + "learning_rate": 2.6256520810593984e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2013 + }, + { + "completion_length": 2960.3333740234375, + "epoch": 0.3070121951219512, + "grad_norm": 0.15126707753268642, + "kl": 0.05615234375, + "learning_rate": 2.625124376342944e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2014 + }, + { + "completion_length": 1041.3333740234375, + "epoch": 0.3071646341463415, + "grad_norm": 0.10565703631326515, + "kl": 0.0552978515625, + "learning_rate": 2.6245963530545125e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2015 + }, + { + "completion_length": 3627.3333740234375, + "epoch": 0.3073170731707317, + "grad_norm": 0.047579314018243654, + "kl": 0.037109375, + "learning_rate": 2.6240680113436096e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2016 + }, + { + "completion_length": 3923.0, + "epoch": 0.307469512195122, + "grad_norm": 0.043952216788974485, + "kl": 0.0289306640625, + "learning_rate": 2.623539351359833e-06, + "loss": 0.0012, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2017 + }, + { + "completion_length": 2254.666748046875, + "epoch": 0.3076219512195122, + "grad_norm": 0.10086678369996167, + "kl": 0.0596923828125, + "learning_rate": 2.623010373252868e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2018 + }, + { + "completion_length": 1736.5000915527344, + "epoch": 0.30777439024390246, + "grad_norm": 0.07983004338641968, + "kl": 0.0499267578125, + "learning_rate": 2.622481077172493e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2019 + }, + { + "completion_length": 3781.3333740234375, + "epoch": 0.3079268292682927, + "grad_norm": 0.0658581111083351, + "kl": 0.0460205078125, + "learning_rate": 2.6219514632685732e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2020 + }, + { + "completion_length": 2669.5, + "epoch": 0.30807926829268295, + "grad_norm": 0.0712521753847299, + "kl": 0.0443115234375, + "learning_rate": 2.621421531691067e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2021 + }, + { + "completion_length": 3497.8333740234375, + "epoch": 0.30823170731707317, + "grad_norm": 0.06298276793625354, + "kl": 0.0352783203125, + "learning_rate": 2.620891282590021e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2022 + }, + { + "completion_length": 3826.166748046875, + "epoch": 0.30838414634146344, + "grad_norm": 0.06678193623345079, + "kl": 0.0316162109375, + "learning_rate": 2.6203607161155706e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2023 + }, + { + "completion_length": 3044.666748046875, + "epoch": 0.30853658536585366, + "grad_norm": 0.07711798087345437, + "kl": 0.0557861328125, + "learning_rate": 2.619829832417944e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2024 + }, + { + "completion_length": 2463.8333435058594, + "epoch": 0.3086890243902439, + "grad_norm": 0.41332107179151717, + "kl": 0.0662841796875, + "learning_rate": 2.619298631647457e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2025 + }, + { + "completion_length": 2269.3334350585938, + "epoch": 0.30884146341463414, + "grad_norm": 0.07972737208643857, + "kl": 0.06884765625, + "learning_rate": 2.6187671139545158e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2026 + }, + { + "completion_length": 1287.0000610351562, + "epoch": 0.3089939024390244, + "grad_norm": 0.1437871506694, + "kl": 0.061767578125, + "learning_rate": 2.6182352794896163e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2027 + }, + { + "completion_length": 1857.3333740234375, + "epoch": 0.30914634146341463, + "grad_norm": 0.1403497740099658, + "kl": 0.0626220703125, + "learning_rate": 2.6177031284033447e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2028 + }, + { + "completion_length": 2434.0, + "epoch": 0.3092987804878049, + "grad_norm": 0.08552731799684424, + "kl": 0.0556640625, + "learning_rate": 2.6171706608463752e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2029 + }, + { + "completion_length": 2527.3333435058594, + "epoch": 0.3094512195121951, + "grad_norm": 0.14014210266614055, + "kl": 0.067138671875, + "learning_rate": 2.616637876969473e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2030 + }, + { + "completion_length": 3071.0001220703125, + "epoch": 0.3096036585365854, + "grad_norm": 0.11865482901953288, + "kl": 0.0474853515625, + "learning_rate": 2.6161047769234935e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2031 + }, + { + "completion_length": 896.6667175292969, + "epoch": 0.3097560975609756, + "grad_norm": 0.17994701094445936, + "kl": 0.0577392578125, + "learning_rate": 2.6155713608593796e-06, + "loss": 0.0023, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2032 + }, + { + "completion_length": 2005.666748046875, + "epoch": 0.3099085365853659, + "grad_norm": 0.0724641079155299, + "kl": 0.0390625, + "learning_rate": 2.6150376289281652e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2033 + }, + { + "completion_length": 3200.666748046875, + "epoch": 0.3100609756097561, + "grad_norm": 0.059479100975767335, + "kl": 0.0467529296875, + "learning_rate": 2.6145035812809726e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2034 + }, + { + "completion_length": 1969.6666870117188, + "epoch": 0.31021341463414637, + "grad_norm": 0.16206843871430657, + "kl": 0.0589599609375, + "learning_rate": 2.613969218069015e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2035 + }, + { + "completion_length": 1706.5001220703125, + "epoch": 0.3103658536585366, + "grad_norm": 0.10494853207664705, + "kl": 0.0628662109375, + "learning_rate": 2.6134345394435936e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2036 + }, + { + "completion_length": 1999.0, + "epoch": 0.31051829268292686, + "grad_norm": 1.0135689466131392, + "kl": 0.0570068359375, + "learning_rate": 2.612899545556099e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2037 + }, + { + "completion_length": 2365.3333740234375, + "epoch": 0.31067073170731707, + "grad_norm": 0.08133398814402244, + "kl": 0.0390625, + "learning_rate": 2.6123642365580122e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2038 + }, + { + "completion_length": 1304.5000305175781, + "epoch": 0.31082317073170734, + "grad_norm": 0.11383617185077423, + "kl": 0.061767578125, + "learning_rate": 2.611828612600902e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2039 + }, + { + "completion_length": 1556.8333740234375, + "epoch": 0.31097560975609756, + "grad_norm": 0.10007490133223539, + "kl": 0.057373046875, + "learning_rate": 2.6112926738364267e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2040 + }, + { + "completion_length": 1221.0, + "epoch": 0.31112804878048783, + "grad_norm": 0.11153454798963952, + "kl": 0.060302734375, + "learning_rate": 2.6107564204163344e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2041 + }, + { + "completion_length": 1656.1667175292969, + "epoch": 0.31128048780487805, + "grad_norm": 0.08967517361002357, + "kl": 0.05615234375, + "learning_rate": 2.6102198524924624e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2042 + }, + { + "completion_length": 2552.3334350585938, + "epoch": 0.3114329268292683, + "grad_norm": 0.08992629323034713, + "kl": 0.0521240234375, + "learning_rate": 2.6096829702167363e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2043 + }, + { + "completion_length": 1870.166748046875, + "epoch": 0.31158536585365854, + "grad_norm": 0.09067319705254046, + "kl": 0.0643310546875, + "learning_rate": 2.6091457737411704e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2044 + }, + { + "completion_length": 3713.5, + "epoch": 0.3117378048780488, + "grad_norm": 0.10928613802049265, + "kl": 0.047119140625, + "learning_rate": 2.608608263217869e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2045 + }, + { + "completion_length": 2447.3333435058594, + "epoch": 0.311890243902439, + "grad_norm": 0.14755308228107064, + "kl": 0.0679931640625, + "learning_rate": 2.608070438799025e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2046 + }, + { + "completion_length": 4096.0, + "epoch": 0.3120426829268293, + "grad_norm": 0.06617797448557483, + "kl": 0.0423583984375, + "learning_rate": 2.60753230063692e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2047 + }, + { + "completion_length": 1834.3333435058594, + "epoch": 0.3121951219512195, + "grad_norm": 0.12073495223995993, + "kl": 0.071044921875, + "learning_rate": 2.606993848883924e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2048 + }, + { + "completion_length": 2308.8334350585938, + "epoch": 0.3123475609756098, + "grad_norm": 0.06537683747771723, + "kl": 0.0343017578125, + "learning_rate": 2.6064550836924966e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2049 + }, + { + "completion_length": 2278.5, + "epoch": 0.3125, + "grad_norm": 0.06951714875845595, + "kl": 0.04248046875, + "learning_rate": 2.605916005215186e-06, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2050 + }, + { + "completion_length": 2925.5, + "epoch": 0.3126524390243902, + "grad_norm": 0.10169916746601329, + "kl": 0.0533447265625, + "learning_rate": 2.605376613604629e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2051 + }, + { + "completion_length": 1531.6666870117188, + "epoch": 0.3128048780487805, + "grad_norm": 0.07888546459659011, + "kl": 0.0482177734375, + "learning_rate": 2.6048369090135504e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2052 + }, + { + "completion_length": 3113.5001220703125, + "epoch": 0.3129573170731707, + "grad_norm": 0.09312980239415315, + "kl": 0.0570068359375, + "learning_rate": 2.604296891594765e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2053 + }, + { + "completion_length": 2202.0000915527344, + "epoch": 0.313109756097561, + "grad_norm": 0.0887976483636807, + "kl": 0.0494384765625, + "learning_rate": 2.6037565615011744e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2054 + }, + { + "completion_length": 1108.8333740234375, + "epoch": 0.3132621951219512, + "grad_norm": 0.11462691000420165, + "kl": 0.068115234375, + "learning_rate": 2.6032159188857706e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2055 + }, + { + "completion_length": 2764.0001220703125, + "epoch": 0.31341463414634146, + "grad_norm": 0.06582884766178793, + "kl": 0.0526123046875, + "learning_rate": 2.6026749639016327e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2056 + }, + { + "completion_length": 863.0, + "epoch": 0.3135670731707317, + "grad_norm": 2.191242115796686, + "kl": 0.08935546875, + "learning_rate": 2.602133696701929e-06, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2057 + }, + { + "completion_length": 1160.3333740234375, + "epoch": 0.31371951219512195, + "grad_norm": 0.12286485305620143, + "kl": 0.077880859375, + "learning_rate": 2.6015921174399153e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2058 + }, + { + "completion_length": 1940.5, + "epoch": 0.31387195121951217, + "grad_norm": 0.08459032193697091, + "kl": 0.0582275390625, + "learning_rate": 2.6010502262689372e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2059 + }, + { + "completion_length": 1580.1666870117188, + "epoch": 0.31402439024390244, + "grad_norm": 0.11594406104865668, + "kl": 0.0614013671875, + "learning_rate": 2.600508023342428e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2060 + }, + { + "completion_length": 1153.3333435058594, + "epoch": 0.31417682926829266, + "grad_norm": 0.14678594792431587, + "kl": 0.09423828125, + "learning_rate": 2.5999655088139084e-06, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2061 + }, + { + "completion_length": 2709.8333740234375, + "epoch": 0.31432926829268293, + "grad_norm": 0.07814002169859882, + "kl": 0.067626953125, + "learning_rate": 2.599422682836988e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2062 + }, + { + "completion_length": 1302.6666870117188, + "epoch": 0.31448170731707314, + "grad_norm": 0.10521621394240976, + "kl": 0.06396484375, + "learning_rate": 2.598879545565365e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2063 + }, + { + "completion_length": 2813.666748046875, + "epoch": 0.3146341463414634, + "grad_norm": 0.060968002182257265, + "kl": 0.06640625, + "learning_rate": 2.5983360971528252e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2064 + }, + { + "completion_length": 1115.5, + "epoch": 0.31478658536585363, + "grad_norm": 0.1014980393648225, + "kl": 0.0628662109375, + "learning_rate": 2.5977923377532427e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2065 + }, + { + "completion_length": 1373.1666870117188, + "epoch": 0.3149390243902439, + "grad_norm": 0.08900509284939195, + "kl": 0.0628662109375, + "learning_rate": 2.5972482675205794e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2066 + }, + { + "completion_length": 1253.166748046875, + "epoch": 0.3150914634146341, + "grad_norm": 0.1180965527195981, + "kl": 0.07568359375, + "learning_rate": 2.596703886608886e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2067 + }, + { + "completion_length": 1469.3334350585938, + "epoch": 0.3152439024390244, + "grad_norm": 0.11305816127076838, + "kl": 0.0670166015625, + "learning_rate": 2.5961591951722993e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2068 + }, + { + "completion_length": 1270.0000610351562, + "epoch": 0.3153963414634146, + "grad_norm": 0.09219450420771846, + "kl": 0.0487060546875, + "learning_rate": 2.5956141933650465e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2069 + }, + { + "completion_length": 1648.166748046875, + "epoch": 0.3155487804878049, + "grad_norm": 0.14110159256924498, + "kl": 0.066162109375, + "learning_rate": 2.5950688813414415e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2070 + }, + { + "completion_length": 1852.5001220703125, + "epoch": 0.3157012195121951, + "grad_norm": 0.14562065984810935, + "kl": 0.072265625, + "learning_rate": 2.594523259255885e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2071 + }, + { + "completion_length": 1250.3333740234375, + "epoch": 0.31585365853658537, + "grad_norm": 1.876414853555558, + "kl": 0.0687255859375, + "learning_rate": 2.5939773272628674e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2072 + }, + { + "completion_length": 1185.8333740234375, + "epoch": 0.3160060975609756, + "grad_norm": 2.254000362291841, + "kl": 0.0814208984375, + "learning_rate": 2.593431085516966e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2073 + }, + { + "completion_length": 1328.6666870117188, + "epoch": 0.31615853658536586, + "grad_norm": 0.10854924294719082, + "kl": 0.0599365234375, + "learning_rate": 2.5928845341728445e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2074 + }, + { + "completion_length": 720.5, + "epoch": 0.3163109756097561, + "grad_norm": 0.15440993036739226, + "kl": 0.0972900390625, + "learning_rate": 2.592337673385257e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2075 + }, + { + "completion_length": 1114.1666870117188, + "epoch": 0.31646341463414634, + "grad_norm": 0.10476475534665762, + "kl": 0.0576171875, + "learning_rate": 2.5917905033090436e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2076 + }, + { + "completion_length": 1591.6667175292969, + "epoch": 0.31661585365853656, + "grad_norm": 0.1114284356300561, + "kl": 0.0673828125, + "learning_rate": 2.5912430240991313e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2077 + }, + { + "completion_length": 2628.5, + "epoch": 0.31676829268292683, + "grad_norm": 0.11695840388216752, + "kl": 0.053955078125, + "learning_rate": 2.5906952359105356e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2078 + }, + { + "completion_length": 1314.0, + "epoch": 0.31692073170731705, + "grad_norm": 0.17975632133090752, + "kl": 0.1025390625, + "learning_rate": 2.59014713889836e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2079 + }, + { + "completion_length": 2562.3333740234375, + "epoch": 0.3170731707317073, + "grad_norm": 0.11065446342512994, + "kl": 0.040283203125, + "learning_rate": 2.5895987332177935e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2080 + }, + { + "completion_length": 1092.8333740234375, + "epoch": 0.31722560975609754, + "grad_norm": 0.15073156821825964, + "kl": 0.068603515625, + "learning_rate": 2.5890500190241154e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2081 + }, + { + "completion_length": 2386.3333740234375, + "epoch": 0.3173780487804878, + "grad_norm": 0.06371357266489858, + "kl": 0.052001953125, + "learning_rate": 2.588500996472689e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2082 + }, + { + "completion_length": 2474.5001220703125, + "epoch": 0.317530487804878, + "grad_norm": 1.7659166487994173, + "kl": 0.066162109375, + "learning_rate": 2.5879516657189687e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2083 + }, + { + "completion_length": 2004.3333740234375, + "epoch": 0.3176829268292683, + "grad_norm": 0.17888395246372918, + "kl": 0.070068359375, + "learning_rate": 2.587402026918492e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2084 + }, + { + "completion_length": 2459.0, + "epoch": 0.3178353658536585, + "grad_norm": 0.12948166815468717, + "kl": 0.0606689453125, + "learning_rate": 2.5868520802268866e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2085 + }, + { + "completion_length": 2272.166717529297, + "epoch": 0.3179878048780488, + "grad_norm": 0.07417222095461033, + "kl": 0.0570068359375, + "learning_rate": 2.586301825799867e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2086 + }, + { + "completion_length": 867.3333435058594, + "epoch": 0.318140243902439, + "grad_norm": 0.154533353645472, + "kl": 0.100341796875, + "learning_rate": 2.5857512637932334e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2087 + }, + { + "completion_length": 2679.3333740234375, + "epoch": 0.3182926829268293, + "grad_norm": 0.08793998150758778, + "kl": 0.079833984375, + "learning_rate": 2.5852003943628746e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2088 + }, + { + "completion_length": 963.1666870117188, + "epoch": 0.3184451219512195, + "grad_norm": 0.20129749855165383, + "kl": 0.0869140625, + "learning_rate": 2.5846492176647658e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2089 + }, + { + "completion_length": 1393.666748046875, + "epoch": 0.31859756097560976, + "grad_norm": 0.1090479295596599, + "kl": 0.08447265625, + "learning_rate": 2.584097733854969e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2090 + }, + { + "completion_length": 2240.5001220703125, + "epoch": 0.31875, + "grad_norm": 0.14861266191346226, + "kl": 0.08251953125, + "learning_rate": 2.5835459430896333e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2091 + }, + { + "completion_length": 1209.5, + "epoch": 0.31890243902439025, + "grad_norm": 0.12218666865679298, + "kl": 0.080810546875, + "learning_rate": 2.5829938455249958e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2092 + }, + { + "completion_length": 2159.0001220703125, + "epoch": 0.31905487804878047, + "grad_norm": 0.08104378104260726, + "kl": 0.0791015625, + "learning_rate": 2.5824414413173777e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2093 + }, + { + "completion_length": 1569.5000915527344, + "epoch": 0.31920731707317074, + "grad_norm": 0.12018888899053046, + "kl": 0.0712890625, + "learning_rate": 2.581888730623191e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2094 + }, + { + "completion_length": 2203.5, + "epoch": 0.31935975609756095, + "grad_norm": 0.08341052917179681, + "kl": 0.058349609375, + "learning_rate": 2.5813357135989307e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2095 + }, + { + "completion_length": 1389.3333435058594, + "epoch": 0.3195121951219512, + "grad_norm": 0.10186668399304426, + "kl": 0.07666015625, + "learning_rate": 2.5807823904011804e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2096 + }, + { + "completion_length": 1469.666748046875, + "epoch": 0.31966463414634144, + "grad_norm": 0.15205806263622684, + "kl": 0.07275390625, + "learning_rate": 2.580228761186611e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2097 + }, + { + "completion_length": 1371.3333740234375, + "epoch": 0.3198170731707317, + "grad_norm": 0.15758731614691246, + "kl": 0.085205078125, + "learning_rate": 2.579674826111978e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2098 + }, + { + "completion_length": 922.0, + "epoch": 0.31996951219512193, + "grad_norm": 0.09143366536006155, + "kl": 0.054931640625, + "learning_rate": 2.5791205853341253e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2099 + }, + { + "completion_length": 1142.8333740234375, + "epoch": 0.3201219512195122, + "grad_norm": 0.1963681811445979, + "kl": 0.09765625, + "learning_rate": 2.578566039009983e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2100 + }, + { + "completion_length": 840.8333435058594, + "epoch": 0.3202743902439024, + "grad_norm": 2.214925901035111, + "kl": 0.0677490234375, + "learning_rate": 2.5780111872965667e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2101 + }, + { + "completion_length": 993.3333740234375, + "epoch": 0.3204268292682927, + "grad_norm": 0.15967546416575118, + "kl": 0.0810546875, + "learning_rate": 2.5774560303509794e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2102 + }, + { + "completion_length": 1200.8333740234375, + "epoch": 0.3205792682926829, + "grad_norm": 0.09547696614699593, + "kl": 0.08740234375, + "learning_rate": 2.5769005683304112e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2103 + }, + { + "completion_length": 1008.1666870117188, + "epoch": 0.3207317073170732, + "grad_norm": 0.09624650515401603, + "kl": 0.06005859375, + "learning_rate": 2.576344801392137e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2104 + }, + { + "completion_length": 1627.8333740234375, + "epoch": 0.3208841463414634, + "grad_norm": 1.3690589352064895, + "kl": 0.085205078125, + "learning_rate": 2.575788729693518e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2105 + }, + { + "completion_length": 1563.8333740234375, + "epoch": 0.32103658536585367, + "grad_norm": 0.1438653712889337, + "kl": 0.073974609375, + "learning_rate": 2.575232353392004e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2106 + }, + { + "completion_length": 1295.3333740234375, + "epoch": 0.3211890243902439, + "grad_norm": 0.176603984025507, + "kl": 0.09814453125, + "learning_rate": 2.5746756726451286e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2107 + }, + { + "completion_length": 842.8333435058594, + "epoch": 0.32134146341463415, + "grad_norm": 0.6353930824107191, + "kl": 0.091552734375, + "learning_rate": 2.5741186876105127e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2108 + }, + { + "completion_length": 996.8333740234375, + "epoch": 0.32149390243902437, + "grad_norm": 0.09046749775062081, + "kl": 0.0499267578125, + "learning_rate": 2.573561398445863e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2109 + }, + { + "completion_length": 837.1666870117188, + "epoch": 0.32164634146341464, + "grad_norm": 0.4662020581104322, + "kl": 0.083984375, + "learning_rate": 2.5730038053089725e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2110 + }, + { + "completion_length": 917.8333740234375, + "epoch": 0.32179878048780486, + "grad_norm": 0.1383169549712866, + "kl": 0.0869140625, + "learning_rate": 2.5724459083577205e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2111 + }, + { + "completion_length": 2685.5, + "epoch": 0.32195121951219513, + "grad_norm": 0.0827952622992849, + "kl": 0.06689453125, + "learning_rate": 2.571887707750072e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2112 + }, + { + "completion_length": 2101.5, + "epoch": 0.32210365853658535, + "grad_norm": 0.15738620788341057, + "kl": 0.0635986328125, + "learning_rate": 2.5713292036440775e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2113 + }, + { + "completion_length": 974.1666870117188, + "epoch": 0.3222560975609756, + "grad_norm": 0.16617546013510875, + "kl": 0.087158203125, + "learning_rate": 2.5707703961978747e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2114 + }, + { + "completion_length": 2434.33349609375, + "epoch": 0.32240853658536583, + "grad_norm": 0.11180262086054063, + "kl": 0.0751953125, + "learning_rate": 2.570211285569686e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2115 + }, + { + "completion_length": 1331.6666870117188, + "epoch": 0.3225609756097561, + "grad_norm": 0.08648540523351914, + "kl": 0.0677490234375, + "learning_rate": 2.56965187191782e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2116 + }, + { + "completion_length": 1907.5, + "epoch": 0.3227134146341463, + "grad_norm": 0.1511525859284648, + "kl": 0.07373046875, + "learning_rate": 2.569092155400672e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2117 + }, + { + "completion_length": 1477.8333435058594, + "epoch": 0.3228658536585366, + "grad_norm": 0.14025219445309697, + "kl": 0.077392578125, + "learning_rate": 2.5685321361767213e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2118 + }, + { + "completion_length": 2306.0, + "epoch": 0.3230182926829268, + "grad_norm": 0.08727986358149974, + "kl": 0.0765380859375, + "learning_rate": 2.5679718144045343e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2119 + }, + { + "completion_length": 2565.0000610351562, + "epoch": 0.3231707317073171, + "grad_norm": 0.0746727035531021, + "kl": 0.070068359375, + "learning_rate": 2.5674111902427625e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2120 + }, + { + "completion_length": 916.0000305175781, + "epoch": 0.3233231707317073, + "grad_norm": 0.11336990688937808, + "kl": 0.07861328125, + "learning_rate": 2.5668502638501433e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2121 + }, + { + "completion_length": 689.8333740234375, + "epoch": 0.32347560975609757, + "grad_norm": 0.1232331377571752, + "kl": 0.0570068359375, + "learning_rate": 2.566289035385499e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2122 + }, + { + "completion_length": 3752.3333740234375, + "epoch": 0.3236280487804878, + "grad_norm": 0.056083941558779375, + "kl": 0.0460205078125, + "learning_rate": 2.5657275050077393e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2123 + }, + { + "completion_length": 1250.8333740234375, + "epoch": 0.32378048780487806, + "grad_norm": 0.14358349081838703, + "kl": 0.083984375, + "learning_rate": 2.5651656728758566e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2124 + }, + { + "completion_length": 955.6666870117188, + "epoch": 0.3239329268292683, + "grad_norm": 0.17533068410330083, + "kl": 0.07958984375, + "learning_rate": 2.5646035391489306e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2125 + }, + { + "completion_length": 958.6666870117188, + "epoch": 0.32408536585365855, + "grad_norm": 0.2278783063880392, + "kl": 0.07421875, + "learning_rate": 2.5640411039861264e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2126 + }, + { + "completion_length": 1059.8333740234375, + "epoch": 0.32423780487804876, + "grad_norm": 0.11873321051302115, + "kl": 0.09228515625, + "learning_rate": 2.5634783675466934e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2127 + }, + { + "completion_length": 1482.3333740234375, + "epoch": 0.32439024390243903, + "grad_norm": 0.09314574498057945, + "kl": 0.068603515625, + "learning_rate": 2.5629153299899673e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2128 + }, + { + "completion_length": 1148.6666870117188, + "epoch": 0.32454268292682925, + "grad_norm": 0.1679274721072185, + "kl": 0.0665283203125, + "learning_rate": 2.5623519914753687e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2129 + }, + { + "completion_length": 1380.1666870117188, + "epoch": 0.3246951219512195, + "grad_norm": 0.09481795213267738, + "kl": 0.079345703125, + "learning_rate": 2.561788352162403e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2130 + }, + { + "completion_length": 1631.3334350585938, + "epoch": 0.32484756097560974, + "grad_norm": 0.11360789899509394, + "kl": 0.088134765625, + "learning_rate": 2.561224412210662e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2131 + }, + { + "completion_length": 2768.8333740234375, + "epoch": 0.325, + "grad_norm": 0.06802101207045, + "kl": 0.0614013671875, + "learning_rate": 2.5606601717798212e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2132 + }, + { + "completion_length": 731.6666870117188, + "epoch": 0.3251524390243902, + "grad_norm": 0.13235926655107183, + "kl": 0.06640625, + "learning_rate": 2.560095631029642e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2133 + }, + { + "completion_length": 1109.5000610351562, + "epoch": 0.3253048780487805, + "grad_norm": 0.10270923435850388, + "kl": 0.07861328125, + "learning_rate": 2.5595307901199703e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2134 + }, + { + "completion_length": 1773.666748046875, + "epoch": 0.3254573170731707, + "grad_norm": 0.08682432092020259, + "kl": 0.0594482421875, + "learning_rate": 2.5589656492107378e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2135 + }, + { + "completion_length": 1046.5000305175781, + "epoch": 0.325609756097561, + "grad_norm": 0.5957031243078468, + "kl": 0.079833984375, + "learning_rate": 2.5584002084619593e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2136 + }, + { + "completion_length": 618.5, + "epoch": 0.3257621951219512, + "grad_norm": 0.16090571609704382, + "kl": 0.068359375, + "learning_rate": 2.5578344680337373e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2137 + }, + { + "completion_length": 1762.166748046875, + "epoch": 0.3259146341463415, + "grad_norm": 0.090938932702545, + "kl": 0.064453125, + "learning_rate": 2.5572684280862575e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2138 + }, + { + "completion_length": 2090.8333740234375, + "epoch": 0.3260670731707317, + "grad_norm": 0.10243899218763776, + "kl": 0.07861328125, + "learning_rate": 2.55670208877979e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2139 + }, + { + "completion_length": 1891.666748046875, + "epoch": 0.32621951219512196, + "grad_norm": 0.12065723894873157, + "kl": 0.074951171875, + "learning_rate": 2.5561354502746907e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2140 + }, + { + "completion_length": 2694.3334350585938, + "epoch": 0.3263719512195122, + "grad_norm": 0.14032357594906508, + "kl": 0.048828125, + "learning_rate": 2.5555685127314e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2141 + }, + { + "completion_length": 1607.5000610351562, + "epoch": 0.32652439024390245, + "grad_norm": 0.11655133912973253, + "kl": 0.068115234375, + "learning_rate": 2.5550012763104423e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2142 + }, + { + "completion_length": 1038.3333740234375, + "epoch": 0.32667682926829267, + "grad_norm": 0.14175205245535868, + "kl": 0.0771484375, + "learning_rate": 2.554433741172427e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2143 + }, + { + "completion_length": 2462.3333740234375, + "epoch": 0.32682926829268294, + "grad_norm": 0.10426271506867495, + "kl": 0.0625, + "learning_rate": 2.5538659074780484e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2144 + }, + { + "completion_length": 2067.5000610351562, + "epoch": 0.32698170731707316, + "grad_norm": 0.06143604412285628, + "kl": 0.046142578125, + "learning_rate": 2.5532977753880845e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2145 + }, + { + "completion_length": 2809.3333740234375, + "epoch": 0.3271341463414634, + "grad_norm": 0.0554741631565187, + "kl": 0.0408935546875, + "learning_rate": 2.5527293450633993e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2146 + }, + { + "completion_length": 2501.3333435058594, + "epoch": 0.32728658536585364, + "grad_norm": 0.08110664728881671, + "kl": 0.0458984375, + "learning_rate": 2.5521606166649397e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2147 + }, + { + "completion_length": 2042.666748046875, + "epoch": 0.3274390243902439, + "grad_norm": 0.1409852358659705, + "kl": 0.06640625, + "learning_rate": 2.551591590353738e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2148 + }, + { + "completion_length": 2226.0001220703125, + "epoch": 0.32759146341463413, + "grad_norm": 0.07062179189395869, + "kl": 0.0465087890625, + "learning_rate": 2.55102226629091e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2149 + }, + { + "completion_length": 1857.5000610351562, + "epoch": 0.3277439024390244, + "grad_norm": 0.10573461065818136, + "kl": 0.072998046875, + "learning_rate": 2.5504526446376563e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2150 + }, + { + "completion_length": 1704.166748046875, + "epoch": 0.3278963414634146, + "grad_norm": 0.09577010155418575, + "kl": 0.074951171875, + "learning_rate": 2.5498827255552625e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2151 + }, + { + "completion_length": 1794.1667175292969, + "epoch": 0.3280487804878049, + "grad_norm": 0.291821672546512, + "kl": 0.054931640625, + "learning_rate": 2.549312509205097e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2152 + }, + { + "completion_length": 1503.6667175292969, + "epoch": 0.3282012195121951, + "grad_norm": 1.3741827297179598, + "kl": 0.0628662109375, + "learning_rate": 2.548741995748613e-06, + "loss": 0.0025, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2153 + }, + { + "completion_length": 946.6666870117188, + "epoch": 0.3283536585365854, + "grad_norm": 1.642673801382247, + "kl": 0.070068359375, + "learning_rate": 2.548171185347348e-06, + "loss": 0.0028, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 2154 + }, + { + "completion_length": 2987.83349609375, + "epoch": 0.3285060975609756, + "grad_norm": 0.04973872938196065, + "kl": 0.0455322265625, + "learning_rate": 2.5476000781629234e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2155 + }, + { + "completion_length": 3249.0, + "epoch": 0.32865853658536587, + "grad_norm": 0.05788768335893397, + "kl": 0.044921875, + "learning_rate": 2.5470286743570447e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2156 + }, + { + "completion_length": 1298.6666870117188, + "epoch": 0.3288109756097561, + "grad_norm": 0.13243432497242177, + "kl": 0.05810546875, + "learning_rate": 2.546456974091501e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2157 + }, + { + "completion_length": 2518.1666870117188, + "epoch": 0.32896341463414636, + "grad_norm": 0.08445291751697233, + "kl": 0.0565185546875, + "learning_rate": 2.545884977528166e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2158 + }, + { + "completion_length": 1709.0000915527344, + "epoch": 0.3291158536585366, + "grad_norm": 0.15261407162857132, + "kl": 0.07666015625, + "learning_rate": 2.5453126848289974e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2159 + }, + { + "completion_length": 1500.666748046875, + "epoch": 0.32926829268292684, + "grad_norm": 0.08566999660254791, + "kl": 0.0650634765625, + "learning_rate": 2.5447400961560355e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2160 + }, + { + "completion_length": 1489.6667175292969, + "epoch": 0.32942073170731706, + "grad_norm": 0.09842470409912345, + "kl": 0.07568359375, + "learning_rate": 2.5441672116714056e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2161 + }, + { + "completion_length": 3029.8333740234375, + "epoch": 0.32957317073170733, + "grad_norm": 0.08158890408051138, + "kl": 0.06103515625, + "learning_rate": 2.5435940315373163e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2162 + }, + { + "completion_length": 1968.8333740234375, + "epoch": 0.32972560975609755, + "grad_norm": 0.09804762175397821, + "kl": 0.070068359375, + "learning_rate": 2.5430205559160604e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2163 + }, + { + "completion_length": 1734.6667175292969, + "epoch": 0.3298780487804878, + "grad_norm": 0.10757925840120462, + "kl": 0.0604248046875, + "learning_rate": 2.542446784970013e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2164 + }, + { + "completion_length": 2470.5, + "epoch": 0.33003048780487804, + "grad_norm": 0.10310587375132696, + "kl": 0.0592041015625, + "learning_rate": 2.541872718861635e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2165 + }, + { + "completion_length": 2942.0, + "epoch": 0.3301829268292683, + "grad_norm": 0.0788512096887679, + "kl": 0.0498046875, + "learning_rate": 2.5412983577534684e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2166 + }, + { + "completion_length": 1574.666748046875, + "epoch": 0.3303353658536585, + "grad_norm": 0.1876576863274369, + "kl": 0.100830078125, + "learning_rate": 2.540723701808141e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2167 + }, + { + "completion_length": 2307.6666870117188, + "epoch": 0.3304878048780488, + "grad_norm": 0.10787498380046481, + "kl": 0.0645751953125, + "learning_rate": 2.5401487511883627e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2168 + }, + { + "completion_length": 2171.1666870117188, + "epoch": 0.330640243902439, + "grad_norm": 0.10419043158561984, + "kl": 0.081787109375, + "learning_rate": 2.539573506056927e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2169 + }, + { + "completion_length": 1847.8333435058594, + "epoch": 0.3307926829268293, + "grad_norm": 0.1047901829799442, + "kl": 0.04522705078125, + "learning_rate": 2.5389979665767115e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2170 + }, + { + "completion_length": 1686.0000610351562, + "epoch": 0.3309451219512195, + "grad_norm": 0.10152557520749485, + "kl": 0.07373046875, + "learning_rate": 2.538422132910676e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2171 + }, + { + "completion_length": 1063.0000305175781, + "epoch": 0.3310975609756098, + "grad_norm": 1.5314339291778265, + "kl": 0.0830078125, + "learning_rate": 2.5378460052218646e-06, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2172 + }, + { + "completion_length": 2412.3333740234375, + "epoch": 0.33125, + "grad_norm": 0.08502658261701983, + "kl": 0.045166015625, + "learning_rate": 2.5372695836734045e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2173 + }, + { + "completion_length": 1801.0000610351562, + "epoch": 0.33140243902439026, + "grad_norm": 0.16901035381869783, + "kl": 0.0927734375, + "learning_rate": 2.5366928684285052e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2174 + }, + { + "completion_length": 1248.5000610351562, + "epoch": 0.3315548780487805, + "grad_norm": 0.175550968165539, + "kl": 0.075439453125, + "learning_rate": 2.53611585965046e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2175 + }, + { + "completion_length": 2430.0, + "epoch": 0.33170731707317075, + "grad_norm": 0.0738701727984298, + "kl": 0.04345703125, + "learning_rate": 2.5355385575026464e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2176 + }, + { + "completion_length": 1137.3333740234375, + "epoch": 0.33185975609756097, + "grad_norm": 0.09953519676885852, + "kl": 0.054931640625, + "learning_rate": 2.5349609621485233e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2177 + }, + { + "completion_length": 1563.8333740234375, + "epoch": 0.33201219512195124, + "grad_norm": 0.11517399920207823, + "kl": 0.073974609375, + "learning_rate": 2.5343830737516326e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2178 + }, + { + "completion_length": 1520.3333435058594, + "epoch": 0.33216463414634145, + "grad_norm": 0.0849888018025306, + "kl": 0.068359375, + "learning_rate": 2.5338048924756015e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2179 + }, + { + "completion_length": 1690.5, + "epoch": 0.3323170731707317, + "grad_norm": 0.11669787554079966, + "kl": 0.0731201171875, + "learning_rate": 2.5332264184841366e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2180 + }, + { + "completion_length": 1642.0000610351562, + "epoch": 0.33246951219512194, + "grad_norm": 0.17809890361658248, + "kl": 0.0592041015625, + "learning_rate": 2.53264765194103e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2181 + }, + { + "completion_length": 2767.166748046875, + "epoch": 0.3326219512195122, + "grad_norm": 0.07763181929066612, + "kl": 0.071044921875, + "learning_rate": 2.532068593010156e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2182 + }, + { + "completion_length": 1840.166748046875, + "epoch": 0.33277439024390243, + "grad_norm": 0.14413749049506716, + "kl": 0.0521240234375, + "learning_rate": 2.531489241855471e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2183 + }, + { + "completion_length": 2460.8333740234375, + "epoch": 0.3329268292682927, + "grad_norm": 0.07727129212166099, + "kl": 0.056640625, + "learning_rate": 2.5309095986410155e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2184 + }, + { + "completion_length": 936.5000305175781, + "epoch": 0.3330792682926829, + "grad_norm": 0.4486890682970091, + "kl": 0.07257080078125, + "learning_rate": 2.5303296635309116e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2185 + }, + { + "completion_length": 1092.0000305175781, + "epoch": 0.3332317073170732, + "grad_norm": 0.10380838950846784, + "kl": 0.066162109375, + "learning_rate": 2.5297494366893636e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2186 + }, + { + "completion_length": 2065.5, + "epoch": 0.3333841463414634, + "grad_norm": 0.07849278056364278, + "kl": 0.0546875, + "learning_rate": 2.5291689182806597e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2187 + }, + { + "completion_length": 1408.3333740234375, + "epoch": 0.3335365853658537, + "grad_norm": 0.1545112206370161, + "kl": 0.05712890625, + "learning_rate": 2.5285881084691706e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2188 + }, + { + "completion_length": 2301.3333740234375, + "epoch": 0.3336890243902439, + "grad_norm": 0.11801538641594882, + "kl": 0.06689453125, + "learning_rate": 2.528007007419348e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2189 + }, + { + "completion_length": 2564.0000610351562, + "epoch": 0.33384146341463417, + "grad_norm": 0.074216945188725, + "kl": 0.0516357421875, + "learning_rate": 2.5274256152957276e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2190 + }, + { + "completion_length": 1916.5000610351562, + "epoch": 0.3339939024390244, + "grad_norm": 0.10923360201259662, + "kl": 0.0516357421875, + "learning_rate": 2.5268439322629273e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2191 + }, + { + "completion_length": 1280.1666870117188, + "epoch": 0.33414634146341465, + "grad_norm": 0.17603593459803038, + "kl": 0.07080078125, + "learning_rate": 2.5262619584856456e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2192 + }, + { + "completion_length": 1259.5000610351562, + "epoch": 0.33429878048780487, + "grad_norm": 0.0892092626372646, + "kl": 0.066162109375, + "learning_rate": 2.5256796941286665e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2193 + }, + { + "completion_length": 1682.5000610351562, + "epoch": 0.33445121951219514, + "grad_norm": 0.15041768621120383, + "kl": 0.0635986328125, + "learning_rate": 2.5250971393568538e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2194 + }, + { + "completion_length": 2231.0001220703125, + "epoch": 0.33460365853658536, + "grad_norm": 0.08584780233557639, + "kl": 0.062744140625, + "learning_rate": 2.524514294335154e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2195 + }, + { + "completion_length": 899.0000305175781, + "epoch": 0.33475609756097563, + "grad_norm": 0.11138224344830894, + "kl": 0.05517578125, + "learning_rate": 2.5239311592285966e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2196 + }, + { + "completion_length": 1484.8333435058594, + "epoch": 0.33490853658536585, + "grad_norm": 0.0841467502685087, + "kl": 0.054443359375, + "learning_rate": 2.523347734202292e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2197 + }, + { + "completion_length": 1478.5000610351562, + "epoch": 0.3350609756097561, + "grad_norm": 0.11134420911211325, + "kl": 0.0606689453125, + "learning_rate": 2.5227640194214343e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2198 + }, + { + "completion_length": 1335.3333740234375, + "epoch": 0.33521341463414633, + "grad_norm": 0.09750690269720631, + "kl": 0.0572509765625, + "learning_rate": 2.5221800150512974e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2199 + }, + { + "completion_length": 2418.3334350585938, + "epoch": 0.3353658536585366, + "grad_norm": 0.14607331661625592, + "kl": 0.0537109375, + "learning_rate": 2.52159572125724e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2200 + }, + { + "completion_length": 2094.8333740234375, + "epoch": 0.3355182926829268, + "grad_norm": 0.09199966970435994, + "kl": 0.0635986328125, + "learning_rate": 2.5210111382047004e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2201 + }, + { + "completion_length": 1638.3333740234375, + "epoch": 0.3356707317073171, + "grad_norm": 0.13274133733639, + "kl": 0.056884765625, + "learning_rate": 2.5204262660591994e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2202 + }, + { + "completion_length": 2813.666748046875, + "epoch": 0.3358231707317073, + "grad_norm": 0.06495854300234981, + "kl": 0.0546875, + "learning_rate": 2.5198411049863407e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2203 + }, + { + "completion_length": 1350.1666870117188, + "epoch": 0.3359756097560976, + "grad_norm": 0.13234328315663887, + "kl": 0.0648193359375, + "learning_rate": 2.5192556551518086e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2204 + }, + { + "completion_length": 1292.3333740234375, + "epoch": 0.3361280487804878, + "grad_norm": 0.08429214567595537, + "kl": 0.0672607421875, + "learning_rate": 2.5186699167213695e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2205 + }, + { + "completion_length": 2433.0000610351562, + "epoch": 0.33628048780487807, + "grad_norm": 0.10290822089501997, + "kl": 0.0587158203125, + "learning_rate": 2.5180838898608718e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2206 + }, + { + "completion_length": 2466.3333740234375, + "epoch": 0.3364329268292683, + "grad_norm": 0.09414536393682084, + "kl": 0.0703125, + "learning_rate": 2.5174975747362456e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2207 + }, + { + "completion_length": 1077.3333435058594, + "epoch": 0.33658536585365856, + "grad_norm": 0.09522666874850104, + "kl": 0.0572509765625, + "learning_rate": 2.5169109715135015e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2208 + }, + { + "completion_length": 1472.3333740234375, + "epoch": 0.3367378048780488, + "grad_norm": 0.0697867002321321, + "kl": 0.04827880859375, + "learning_rate": 2.5163240803587338e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2209 + }, + { + "completion_length": 1336.8333740234375, + "epoch": 0.33689024390243905, + "grad_norm": 0.11912093046993275, + "kl": 0.06787109375, + "learning_rate": 2.5157369014381158e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2210 + }, + { + "completion_length": 1265.3333740234375, + "epoch": 0.33704268292682926, + "grad_norm": 0.07522083853497906, + "kl": 0.0435791015625, + "learning_rate": 2.5151494349179044e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2211 + }, + { + "completion_length": 1011.3333740234375, + "epoch": 0.33719512195121953, + "grad_norm": 0.13253569604692353, + "kl": 0.0606689453125, + "learning_rate": 2.514561680964437e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2212 + }, + { + "completion_length": 856.0000305175781, + "epoch": 0.33734756097560975, + "grad_norm": 0.07512154847277072, + "kl": 0.04217529296875, + "learning_rate": 2.513973639744132e-06, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2213 + }, + { + "completion_length": 2449.8333740234375, + "epoch": 0.3375, + "grad_norm": 0.13889582947803245, + "kl": 0.0472412109375, + "learning_rate": 2.5133853114234908e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2214 + }, + { + "completion_length": 1285.3333740234375, + "epoch": 0.33765243902439024, + "grad_norm": 0.10714157967101161, + "kl": 0.07275390625, + "learning_rate": 2.5127966961690934e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2215 + }, + { + "completion_length": 2558.5001220703125, + "epoch": 0.3378048780487805, + "grad_norm": 1.4679544905541297, + "kl": 0.053466796875, + "learning_rate": 2.512207794147603e-06, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2216 + }, + { + "completion_length": 894.1666870117188, + "epoch": 0.3379573170731707, + "grad_norm": 0.11359027618371098, + "kl": 0.0599365234375, + "learning_rate": 2.5116186055257646e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2217 + }, + { + "completion_length": 973.1666870117188, + "epoch": 0.338109756097561, + "grad_norm": 1.6700825684082294, + "kl": 0.0565185546875, + "learning_rate": 2.5110291304704016e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2218 + }, + { + "completion_length": 1087.6666870117188, + "epoch": 0.3382621951219512, + "grad_norm": 0.15345793441199174, + "kl": 0.0859375, + "learning_rate": 2.510439369148422e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2219 + }, + { + "completion_length": 1219.5000305175781, + "epoch": 0.3384146341463415, + "grad_norm": 0.08203244023471314, + "kl": 0.03759765625, + "learning_rate": 2.5098493217268116e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2220 + }, + { + "completion_length": 1841.8334350585938, + "epoch": 0.3385670731707317, + "grad_norm": 0.125889499586013, + "kl": 0.0531005859375, + "learning_rate": 2.509258988372639e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2221 + }, + { + "completion_length": 1735.6667175292969, + "epoch": 0.338719512195122, + "grad_norm": 0.10401093147492044, + "kl": 0.052734375, + "learning_rate": 2.5086683692530538e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2222 + }, + { + "completion_length": 1983.3334350585938, + "epoch": 0.3388719512195122, + "grad_norm": 1.7465429633357483, + "kl": 0.078857421875, + "learning_rate": 2.508077464535286e-06, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2223 + }, + { + "completion_length": 989.5000305175781, + "epoch": 0.33902439024390246, + "grad_norm": 0.10771612994161153, + "kl": 0.06298828125, + "learning_rate": 2.507486274386647e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2224 + }, + { + "completion_length": 1711.1666870117188, + "epoch": 0.3391768292682927, + "grad_norm": 0.15143792606467177, + "kl": 0.072998046875, + "learning_rate": 2.5068947989745276e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2225 + }, + { + "completion_length": 2042.0001220703125, + "epoch": 0.33932926829268295, + "grad_norm": 0.10083304959573218, + "kl": 0.0743408203125, + "learning_rate": 2.506303038466401e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2226 + }, + { + "completion_length": 1524.3333740234375, + "epoch": 0.33948170731707317, + "grad_norm": 1.4953958172562092, + "kl": 0.0712890625, + "learning_rate": 2.505710993029821e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2227 + }, + { + "completion_length": 1845.0001220703125, + "epoch": 0.33963414634146344, + "grad_norm": 1.4378238545733568, + "kl": 0.070068359375, + "learning_rate": 2.505118662832421e-06, + "loss": 0.0028, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2228 + }, + { + "completion_length": 2456.166748046875, + "epoch": 0.33978658536585366, + "grad_norm": 0.18075662141359822, + "kl": 0.0859375, + "learning_rate": 2.504526048041915e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2229 + }, + { + "completion_length": 867.6666870117188, + "epoch": 0.3399390243902439, + "grad_norm": 0.35432750867301016, + "kl": 0.06787109375, + "learning_rate": 2.5039331488260988e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2230 + }, + { + "completion_length": 977.0000610351562, + "epoch": 0.34009146341463414, + "grad_norm": 0.11969275819004126, + "kl": 0.080810546875, + "learning_rate": 2.5033399653528488e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2231 + }, + { + "completion_length": 1835.1666870117188, + "epoch": 0.3402439024390244, + "grad_norm": 0.18726028991052482, + "kl": 0.078125, + "learning_rate": 2.5027464977901206e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2232 + }, + { + "completion_length": 2238.3334350585938, + "epoch": 0.34039634146341463, + "grad_norm": 0.07659761943973176, + "kl": 0.0596923828125, + "learning_rate": 2.5021527463059507e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2233 + }, + { + "completion_length": 1375.166748046875, + "epoch": 0.3405487804878049, + "grad_norm": 0.935517610870034, + "kl": 0.078125, + "learning_rate": 2.5015587110684565e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2234 + }, + { + "completion_length": 1285.5000610351562, + "epoch": 0.3407012195121951, + "grad_norm": 3.365873500203678, + "kl": 0.0714111328125, + "learning_rate": 2.500964392245835e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2235 + }, + { + "completion_length": 1769.3333740234375, + "epoch": 0.3408536585365854, + "grad_norm": 1.1392372835200177, + "kl": 0.06201171875, + "learning_rate": 2.5003697900063643e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2236 + }, + { + "completion_length": 2226.666748046875, + "epoch": 0.3410060975609756, + "grad_norm": 0.08486332176720159, + "kl": 0.0599365234375, + "learning_rate": 2.499774904518402e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2237 + }, + { + "completion_length": 2619.3333740234375, + "epoch": 0.3411585365853659, + "grad_norm": 0.10303377464185137, + "kl": 0.078125, + "learning_rate": 2.4991797359503862e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2238 + }, + { + "completion_length": 1684.166748046875, + "epoch": 0.3413109756097561, + "grad_norm": 0.4837887726615985, + "kl": 0.089599609375, + "learning_rate": 2.4985842844708354e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2239 + }, + { + "completion_length": 2012.0000610351562, + "epoch": 0.34146341463414637, + "grad_norm": 1.8420997682735636, + "kl": 0.0665283203125, + "learning_rate": 2.4979885502483478e-06, + "loss": 0.0027, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2240 + }, + { + "completion_length": 3210.3333740234375, + "epoch": 0.3416158536585366, + "grad_norm": 0.11186627816984959, + "kl": 0.072509765625, + "learning_rate": 2.4973925334516027e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2241 + }, + { + "completion_length": 3714.5001220703125, + "epoch": 0.34176829268292686, + "grad_norm": 0.05184698650494866, + "kl": 0.0411376953125, + "learning_rate": 2.496796234249357e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2242 + }, + { + "completion_length": 3163.0001220703125, + "epoch": 0.34192073170731707, + "grad_norm": 0.09013539423806055, + "kl": 0.060302734375, + "learning_rate": 2.4961996528104504e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2243 + }, + { + "completion_length": 1956.5, + "epoch": 0.34207317073170734, + "grad_norm": 0.11698435855001105, + "kl": 0.10498046875, + "learning_rate": 2.4956027893038004e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2244 + }, + { + "completion_length": 2694.0, + "epoch": 0.34222560975609756, + "grad_norm": 0.8942926736488334, + "kl": 0.067138671875, + "learning_rate": 2.4950056438984056e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2245 + }, + { + "completion_length": 4096.0, + "epoch": 0.34237804878048783, + "grad_norm": 0.19072567235354757, + "kl": 0.03277587890625, + "learning_rate": 2.4944082167633443e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2246 + }, + { + "completion_length": 2419.5000610351562, + "epoch": 0.34253048780487805, + "grad_norm": 1.2313746266775638, + "kl": 0.0849609375, + "learning_rate": 2.493810508067774e-06, + "loss": 0.0034, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2247 + }, + { + "completion_length": 3565.5, + "epoch": 0.3426829268292683, + "grad_norm": 0.16711640205432896, + "kl": 0.0604248046875, + "learning_rate": 2.4932125179809316e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2248 + }, + { + "completion_length": 4096.0, + "epoch": 0.34283536585365854, + "grad_norm": 0.059282624698742875, + "kl": 0.0447998046875, + "learning_rate": 2.4926142466721353e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2249 + }, + { + "completion_length": 3112.0, + "epoch": 0.3429878048780488, + "grad_norm": 0.08709475062807955, + "kl": 0.0594482421875, + "learning_rate": 2.492015694310781e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2250 + }, + { + "completion_length": 4096.0, + "epoch": 0.343140243902439, + "grad_norm": 0.057531685513722786, + "kl": 0.048583984375, + "learning_rate": 2.491416861066346e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2251 + }, + { + "completion_length": 4096.0, + "epoch": 0.3432926829268293, + "grad_norm": 0.05064657961431565, + "kl": 0.04248046875, + "learning_rate": 2.4908177471083855e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2252 + }, + { + "completion_length": 3438.3333740234375, + "epoch": 0.3434451219512195, + "grad_norm": 0.12868572007074336, + "kl": 0.0654296875, + "learning_rate": 2.4902183526065353e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2253 + }, + { + "completion_length": 3371.166748046875, + "epoch": 0.3435975609756098, + "grad_norm": 0.09799759990099444, + "kl": 0.069091796875, + "learning_rate": 2.4896186777305097e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2254 + }, + { + "completion_length": 4096.0, + "epoch": 0.34375, + "grad_norm": 0.043282918022978636, + "kl": 0.0447998046875, + "learning_rate": 2.4890187226501035e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2255 + }, + { + "completion_length": 2408.166748046875, + "epoch": 0.3439024390243902, + "grad_norm": 0.4213509457360713, + "kl": 0.095703125, + "learning_rate": 2.4884184875351897e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2256 + }, + { + "completion_length": 4096.0, + "epoch": 0.3440548780487805, + "grad_norm": 0.03724092519577007, + "kl": 0.047607421875, + "learning_rate": 2.487817972555722e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2257 + }, + { + "completion_length": 4096.0, + "epoch": 0.3442073170731707, + "grad_norm": 0.043184137581808134, + "kl": 0.05126953125, + "learning_rate": 2.4872171778817317e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2258 + }, + { + "completion_length": 2709.0, + "epoch": 0.344359756097561, + "grad_norm": 0.09046300549705497, + "kl": 0.074951171875, + "learning_rate": 2.48661610368333e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2259 + }, + { + "completion_length": 3872.8333740234375, + "epoch": 0.3445121951219512, + "grad_norm": 0.03621111709026828, + "kl": 0.04443359375, + "learning_rate": 2.486014750130708e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2260 + }, + { + "completion_length": 4096.0, + "epoch": 0.34466463414634146, + "grad_norm": 0.04005075123434387, + "kl": 0.0517578125, + "learning_rate": 2.485413117394135e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2261 + }, + { + "completion_length": 2754.166748046875, + "epoch": 0.3448170731707317, + "grad_norm": 0.10850222179290463, + "kl": 0.0731201171875, + "learning_rate": 2.4848112056439596e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2262 + }, + { + "completion_length": 1344.6667175292969, + "epoch": 0.34496951219512195, + "grad_norm": 0.35394802557369576, + "kl": 0.1318359375, + "learning_rate": 2.484209015050609e-06, + "loss": 0.0053, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2263 + }, + { + "completion_length": 3590.5001220703125, + "epoch": 0.34512195121951217, + "grad_norm": 0.03843009154642639, + "kl": 0.052978515625, + "learning_rate": 2.48360654578459e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2264 + }, + { + "completion_length": 2581.6666870117188, + "epoch": 0.34527439024390244, + "grad_norm": 0.15919948326936922, + "kl": 0.094970703125, + "learning_rate": 2.4830037980164877e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2265 + }, + { + "completion_length": 2673.8333740234375, + "epoch": 0.34542682926829266, + "grad_norm": 0.1850540472388494, + "kl": 0.10888671875, + "learning_rate": 2.4824007719169666e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2266 + }, + { + "completion_length": 1777.5000915527344, + "epoch": 0.34557926829268293, + "grad_norm": 0.11189501656343055, + "kl": 0.08203125, + "learning_rate": 2.4817974676567706e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2267 + }, + { + "completion_length": 2783.83349609375, + "epoch": 0.34573170731707314, + "grad_norm": 0.06868067671982947, + "kl": 0.084716796875, + "learning_rate": 2.48119388540672e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2268 + }, + { + "completion_length": 1943.1666870117188, + "epoch": 0.3458841463414634, + "grad_norm": 0.12468358784835044, + "kl": 0.113037109375, + "learning_rate": 2.480590025337717e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2269 + }, + { + "completion_length": 4073.0, + "epoch": 0.34603658536585363, + "grad_norm": 0.04406499694807386, + "kl": 0.058349609375, + "learning_rate": 2.4799858876207393e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2270 + }, + { + "completion_length": 2796.5, + "epoch": 0.3461890243902439, + "grad_norm": 0.050426649796182284, + "kl": 0.051513671875, + "learning_rate": 2.479381472426846e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2271 + }, + { + "completion_length": 2510.6666870117188, + "epoch": 0.3463414634146341, + "grad_norm": 0.1497621099168, + "kl": 0.0921630859375, + "learning_rate": 2.4787767799271725e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2272 + }, + { + "completion_length": 3645.5001220703125, + "epoch": 0.3464939024390244, + "grad_norm": 0.03929227679411811, + "kl": 0.061767578125, + "learning_rate": 2.4781718102929343e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2273 + }, + { + "completion_length": 2481.8333740234375, + "epoch": 0.3466463414634146, + "grad_norm": 0.08856764108359517, + "kl": 0.0865478515625, + "learning_rate": 2.477566563695425e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2274 + }, + { + "completion_length": 882.3333435058594, + "epoch": 0.3467987804878049, + "grad_norm": 0.15725501091373265, + "kl": 0.1259765625, + "learning_rate": 2.4769610403060155e-06, + "loss": 0.005, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2275 + }, + { + "completion_length": 1418.5, + "epoch": 0.3469512195121951, + "grad_norm": 0.11814739016965986, + "kl": 0.0953369140625, + "learning_rate": 2.476355240296157e-06, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2276 + }, + { + "completion_length": 1949.666748046875, + "epoch": 0.34710365853658537, + "grad_norm": 0.09211647998147253, + "kl": 0.107421875, + "learning_rate": 2.475749163837377e-06, + "loss": 0.0043, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2277 + }, + { + "completion_length": 646.0000305175781, + "epoch": 0.3472560975609756, + "grad_norm": 0.9299892123445398, + "kl": 0.15966796875, + "learning_rate": 2.4751428111012838e-06, + "loss": 0.0064, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2278 + }, + { + "completion_length": 3196.33349609375, + "epoch": 0.34740853658536586, + "grad_norm": 0.07290477090108999, + "kl": 0.0693359375, + "learning_rate": 2.4745361822595613e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2279 + }, + { + "completion_length": 3857.666748046875, + "epoch": 0.3475609756097561, + "grad_norm": 0.10558326155704413, + "kl": 0.0657958984375, + "learning_rate": 2.473929277483972e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2280 + }, + { + "completion_length": 2981.0, + "epoch": 0.34771341463414634, + "grad_norm": 0.1432344198436365, + "kl": 0.0574951171875, + "learning_rate": 2.4733220969463588e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2281 + }, + { + "completion_length": 3519.166748046875, + "epoch": 0.34786585365853656, + "grad_norm": 0.054349658041482146, + "kl": 0.0550537109375, + "learning_rate": 2.47271464081864e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2282 + }, + { + "completion_length": 2240.3334350585938, + "epoch": 0.34801829268292683, + "grad_norm": 0.1105648156884935, + "kl": 0.0855712890625, + "learning_rate": 2.472106909272814e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2283 + }, + { + "completion_length": 1849.1667175292969, + "epoch": 0.34817073170731705, + "grad_norm": 0.11623347375796374, + "kl": 0.091064453125, + "learning_rate": 2.4714989024809555e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2284 + }, + { + "completion_length": 3280.666748046875, + "epoch": 0.3483231707317073, + "grad_norm": 0.043554865198065854, + "kl": 0.0511474609375, + "learning_rate": 2.4708906206152176e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2285 + }, + { + "completion_length": 2711.1666870117188, + "epoch": 0.34847560975609754, + "grad_norm": 0.1472294329767038, + "kl": 0.071044921875, + "learning_rate": 2.4702820638478323e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2286 + }, + { + "completion_length": 3978.3333740234375, + "epoch": 0.3486280487804878, + "grad_norm": 0.03837656234889542, + "kl": 0.042724609375, + "learning_rate": 2.4696732323511076e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2287 + }, + { + "completion_length": 3540.666748046875, + "epoch": 0.348780487804878, + "grad_norm": 0.060249371392420224, + "kl": 0.064453125, + "learning_rate": 2.4690641262974317e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2288 + }, + { + "completion_length": 2804.0000610351562, + "epoch": 0.3489329268292683, + "grad_norm": 0.1668509362739493, + "kl": 0.0882568359375, + "learning_rate": 2.468454745859268e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2289 + }, + { + "completion_length": 2389.8333435058594, + "epoch": 0.3490853658536585, + "grad_norm": 0.10553450851373518, + "kl": 0.0909423828125, + "learning_rate": 2.46784509120916e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2290 + }, + { + "completion_length": 2925.6666870117188, + "epoch": 0.3492378048780488, + "grad_norm": 0.07233969596191975, + "kl": 0.0643310546875, + "learning_rate": 2.4672351625197264e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2291 + }, + { + "completion_length": 1244.0000305175781, + "epoch": 0.349390243902439, + "grad_norm": 0.09444606176785074, + "kl": 0.087646484375, + "learning_rate": 2.4666249599636654e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2292 + }, + { + "completion_length": 2952.8333740234375, + "epoch": 0.3495426829268293, + "grad_norm": 0.08670991183881693, + "kl": 0.0601806640625, + "learning_rate": 2.4660144837137518e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2293 + }, + { + "completion_length": 1535.0, + "epoch": 0.3496951219512195, + "grad_norm": 0.1717620888926136, + "kl": 0.090576171875, + "learning_rate": 2.4654037339428383e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2294 + }, + { + "completion_length": 2386.666748046875, + "epoch": 0.34984756097560976, + "grad_norm": 0.08932788935379987, + "kl": 0.080810546875, + "learning_rate": 2.464792710823855e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2295 + }, + { + "completion_length": 1862.666748046875, + "epoch": 0.35, + "grad_norm": 0.11483071325172181, + "kl": 0.087646484375, + "learning_rate": 2.464181414529809e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2296 + }, + { + "completion_length": 3048.3333740234375, + "epoch": 0.35015243902439025, + "grad_norm": 0.11855586517688796, + "kl": 0.0811767578125, + "learning_rate": 2.4635698452337855e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2297 + }, + { + "completion_length": 2462.5, + "epoch": 0.35030487804878047, + "grad_norm": 0.07016161924221177, + "kl": 0.0574951171875, + "learning_rate": 2.462958003108946e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2298 + }, + { + "completion_length": 2030.0, + "epoch": 0.35045731707317074, + "grad_norm": 0.06817299468396898, + "kl": 0.0679931640625, + "learning_rate": 2.4623458883285303e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2299 + }, + { + "completion_length": 1877.1667175292969, + "epoch": 0.35060975609756095, + "grad_norm": 0.23369312898135178, + "kl": 0.0697021484375, + "learning_rate": 2.4617335010658546e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2300 + }, + { + "completion_length": 2197.3334197998047, + "epoch": 0.3507621951219512, + "grad_norm": 0.09096387122311728, + "kl": 0.0850830078125, + "learning_rate": 2.4611208414943125e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2301 + }, + { + "completion_length": 2070.666748046875, + "epoch": 0.35091463414634144, + "grad_norm": 0.06929088494483207, + "kl": 0.069091796875, + "learning_rate": 2.460507909787375e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2302 + }, + { + "completion_length": 2252.666748046875, + "epoch": 0.3510670731707317, + "grad_norm": 0.10051927170733073, + "kl": 0.08935546875, + "learning_rate": 2.4598947061185893e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2303 + }, + { + "completion_length": 2315.5, + "epoch": 0.35121951219512193, + "grad_norm": 0.09951094618852233, + "kl": 0.078369140625, + "learning_rate": 2.4592812306615812e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2304 + }, + { + "completion_length": 1679.8334350585938, + "epoch": 0.3513719512195122, + "grad_norm": 0.10524662602068768, + "kl": 0.086669921875, + "learning_rate": 2.4586674835900518e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2305 + }, + { + "completion_length": 1023.3333740234375, + "epoch": 0.3515243902439024, + "grad_norm": 0.21739579089729782, + "kl": 0.133544921875, + "learning_rate": 2.4580534650777804e-06, + "loss": 0.0053, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2306 + }, + { + "completion_length": 1769.5, + "epoch": 0.3516768292682927, + "grad_norm": 0.1201543116718674, + "kl": 0.078857421875, + "learning_rate": 2.457439175298621e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2307 + }, + { + "completion_length": 2196.0, + "epoch": 0.3518292682926829, + "grad_norm": 0.14749151664703364, + "kl": 0.0926513671875, + "learning_rate": 2.456824614426508e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2308 + }, + { + "completion_length": 2100.666717529297, + "epoch": 0.3519817073170732, + "grad_norm": 0.08023776053486838, + "kl": 0.080810546875, + "learning_rate": 2.4562097826354488e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2309 + }, + { + "completion_length": 1510.0000610351562, + "epoch": 0.3521341463414634, + "grad_norm": 0.10806714373575295, + "kl": 0.080078125, + "learning_rate": 2.4555946800995298e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2310 + }, + { + "completion_length": 2475.166748046875, + "epoch": 0.35228658536585367, + "grad_norm": 0.059798103694803866, + "kl": 0.052734375, + "learning_rate": 2.4549793069929144e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2311 + }, + { + "completion_length": 1576.8333740234375, + "epoch": 0.3524390243902439, + "grad_norm": 0.11995499208915268, + "kl": 0.075927734375, + "learning_rate": 2.4543636634898398e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2312 + }, + { + "completion_length": 1634.8334350585938, + "epoch": 0.35259146341463415, + "grad_norm": 0.14283551813689838, + "kl": 0.0687255859375, + "learning_rate": 2.453747749764623e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2313 + }, + { + "completion_length": 2381.5001220703125, + "epoch": 0.35274390243902437, + "grad_norm": 0.13677223556552373, + "kl": 0.1337890625, + "learning_rate": 2.4531315659916557e-06, + "loss": 0.0053, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2314 + }, + { + "completion_length": 1761.166748046875, + "epoch": 0.35289634146341464, + "grad_norm": 0.5322850126997773, + "kl": 0.0908203125, + "learning_rate": 2.452515112345407e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2315 + }, + { + "completion_length": 661.0, + "epoch": 0.35304878048780486, + "grad_norm": 0.1569013274441328, + "kl": 0.112548828125, + "learning_rate": 2.4518983890004216e-06, + "loss": 0.0045, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2316 + }, + { + "completion_length": 3452.3333740234375, + "epoch": 0.35320121951219513, + "grad_norm": 0.06829292237210162, + "kl": 0.0447998046875, + "learning_rate": 2.45128139613132e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2317 + }, + { + "completion_length": 2809.3333740234375, + "epoch": 0.35335365853658535, + "grad_norm": 0.06432582820269321, + "kl": 0.06005859375, + "learning_rate": 2.450664133912801e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2318 + }, + { + "completion_length": 1505.0000610351562, + "epoch": 0.3535060975609756, + "grad_norm": 0.17728333895923565, + "kl": 0.088134765625, + "learning_rate": 2.4500466025196387e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2319 + }, + { + "completion_length": 1900.5, + "epoch": 0.35365853658536583, + "grad_norm": 1.05175164072943, + "kl": 0.09521484375, + "learning_rate": 2.4494288021266825e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2320 + }, + { + "completion_length": 2005.8333740234375, + "epoch": 0.3538109756097561, + "grad_norm": 0.07713578368463472, + "kl": 0.069580078125, + "learning_rate": 2.4488107329088593e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2321 + }, + { + "completion_length": 3336.5, + "epoch": 0.3539634146341463, + "grad_norm": 0.056279994926938436, + "kl": 0.0570068359375, + "learning_rate": 2.448192395041171e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2322 + }, + { + "completion_length": 2407.666748046875, + "epoch": 0.3541158536585366, + "grad_norm": 0.09547951329502313, + "kl": 0.059326171875, + "learning_rate": 2.447573788698697e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2323 + }, + { + "completion_length": 1453.0000610351562, + "epoch": 0.3542682926829268, + "grad_norm": 0.09616516771945442, + "kl": 0.094482421875, + "learning_rate": 2.446954914056591e-06, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2324 + }, + { + "completion_length": 2367.166748046875, + "epoch": 0.3544207317073171, + "grad_norm": 0.1727343224478639, + "kl": 0.075439453125, + "learning_rate": 2.4463357712900834e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2325 + }, + { + "completion_length": 2535.83349609375, + "epoch": 0.3545731707317073, + "grad_norm": 0.38241598672137495, + "kl": 0.081787109375, + "learning_rate": 2.4457163605744812e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2326 + }, + { + "completion_length": 2120.666748046875, + "epoch": 0.35472560975609757, + "grad_norm": 0.08868975498807698, + "kl": 0.088623046875, + "learning_rate": 2.445096682085167e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2327 + }, + { + "completion_length": 3603.0, + "epoch": 0.3548780487804878, + "grad_norm": 0.035928036795971126, + "kl": 0.0496826171875, + "learning_rate": 2.444476735997598e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2328 + }, + { + "completion_length": 1497.3333740234375, + "epoch": 0.35503048780487806, + "grad_norm": 0.8506640788083568, + "kl": 0.108642578125, + "learning_rate": 2.443856522487309e-06, + "loss": 0.0043, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2329 + }, + { + "completion_length": 1324.5000610351562, + "epoch": 0.3551829268292683, + "grad_norm": 0.08336623355320567, + "kl": 0.082763671875, + "learning_rate": 2.443236041729909e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2330 + }, + { + "completion_length": 3110.0001220703125, + "epoch": 0.35533536585365855, + "grad_norm": 0.059676021487897093, + "kl": 0.0489501953125, + "learning_rate": 2.442615293901083e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2331 + }, + { + "completion_length": 1170.3333435058594, + "epoch": 0.35548780487804876, + "grad_norm": 0.1594099292449867, + "kl": 0.0849609375, + "learning_rate": 2.4419942791765926e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2332 + }, + { + "completion_length": 2137.8333740234375, + "epoch": 0.35564024390243903, + "grad_norm": 0.1557531493543967, + "kl": 0.0694580078125, + "learning_rate": 2.441372997732274e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2333 + }, + { + "completion_length": 1911.8333740234375, + "epoch": 0.35579268292682925, + "grad_norm": 0.08369138204765082, + "kl": 0.0770263671875, + "learning_rate": 2.4407514497440396e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2334 + }, + { + "completion_length": 2325.5, + "epoch": 0.3559451219512195, + "grad_norm": 0.10514748476079536, + "kl": 0.0908203125, + "learning_rate": 2.4401296353878756e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2335 + }, + { + "completion_length": 1688.1667175292969, + "epoch": 0.35609756097560974, + "grad_norm": 0.10431197961472315, + "kl": 0.08544921875, + "learning_rate": 2.439507554839846e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2336 + }, + { + "completion_length": 1481.0000610351562, + "epoch": 0.35625, + "grad_norm": 22.246205591500836, + "kl": 0.19287109375, + "learning_rate": 2.438885208276089e-06, + "loss": 0.0077, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2337 + }, + { + "completion_length": 1313.5000305175781, + "epoch": 0.3564024390243902, + "grad_norm": 0.16913187632942617, + "kl": 0.091552734375, + "learning_rate": 2.4382625958728174e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2338 + }, + { + "completion_length": 936.8333435058594, + "epoch": 0.3565548780487805, + "grad_norm": 0.10318430379383021, + "kl": 0.085693359375, + "learning_rate": 2.4376397178063205e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2339 + }, + { + "completion_length": 1499.0000610351562, + "epoch": 0.3567073170731707, + "grad_norm": 0.0769798744446923, + "kl": 0.0692138671875, + "learning_rate": 2.4370165742529625e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2340 + }, + { + "completion_length": 1001.0000305175781, + "epoch": 0.356859756097561, + "grad_norm": 0.12085602063375224, + "kl": 0.078369140625, + "learning_rate": 2.436393165389183e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2341 + }, + { + "completion_length": 1374.5000610351562, + "epoch": 0.3570121951219512, + "grad_norm": 0.16521197250465747, + "kl": 0.072509765625, + "learning_rate": 2.4357694913914953e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2342 + }, + { + "completion_length": 2410.3333740234375, + "epoch": 0.3571646341463415, + "grad_norm": 0.0900465952691597, + "kl": 0.0606689453125, + "learning_rate": 2.435145552436489e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2343 + }, + { + "completion_length": 1798.0000915527344, + "epoch": 0.3573170731707317, + "grad_norm": 0.2650729385549408, + "kl": 0.1025390625, + "learning_rate": 2.4345213487008296e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2344 + }, + { + "completion_length": 1088.5000305175781, + "epoch": 0.35746951219512196, + "grad_norm": 0.14161793380365978, + "kl": 0.0888671875, + "learning_rate": 2.433896880361256e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2345 + }, + { + "completion_length": 1741.6666870117188, + "epoch": 0.3576219512195122, + "grad_norm": 0.12114490263178847, + "kl": 0.075927734375, + "learning_rate": 2.4332721475945815e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2346 + }, + { + "completion_length": 1361.3333740234375, + "epoch": 0.35777439024390245, + "grad_norm": 0.10257598166324819, + "kl": 0.1015625, + "learning_rate": 2.4326471505776967e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2347 + }, + { + "completion_length": 999.0000305175781, + "epoch": 0.35792682926829267, + "grad_norm": 0.12931579273607582, + "kl": 0.10498046875, + "learning_rate": 2.4320218894875647e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2348 + }, + { + "completion_length": 986.3333435058594, + "epoch": 0.35807926829268294, + "grad_norm": 0.12301382730181366, + "kl": 0.06640625, + "learning_rate": 2.4313963645012246e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2349 + }, + { + "completion_length": 970.8333435058594, + "epoch": 0.35823170731707316, + "grad_norm": 0.13712805104667303, + "kl": 0.10693359375, + "learning_rate": 2.43077057579579e-06, + "loss": 0.0043, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2350 + }, + { + "completion_length": 641.6666870117188, + "epoch": 0.3583841463414634, + "grad_norm": 2.194739276241906, + "kl": 0.107666015625, + "learning_rate": 2.4301445235484497e-06, + "loss": 0.0043, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2351 + }, + { + "completion_length": 1316.5000610351562, + "epoch": 0.35853658536585364, + "grad_norm": 0.1210588094594156, + "kl": 0.07373046875, + "learning_rate": 2.4295182079364655e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2352 + }, + { + "completion_length": 1635.166748046875, + "epoch": 0.3586890243902439, + "grad_norm": 0.09743341699144688, + "kl": 0.084228515625, + "learning_rate": 2.4288916291371755e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2353 + }, + { + "completion_length": 1107.8333740234375, + "epoch": 0.35884146341463413, + "grad_norm": 0.1397247650465218, + "kl": 0.141357421875, + "learning_rate": 2.428264787327991e-06, + "loss": 0.0056, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2354 + }, + { + "completion_length": 903.8333435058594, + "epoch": 0.3589939024390244, + "grad_norm": 0.10980490883660791, + "kl": 0.0684814453125, + "learning_rate": 2.4276376826863978e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2355 + }, + { + "completion_length": 969.1667175292969, + "epoch": 0.3591463414634146, + "grad_norm": 0.19939295416707137, + "kl": 0.13623046875, + "learning_rate": 2.427010315389958e-06, + "loss": 0.0054, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2356 + }, + { + "completion_length": 1600.8333740234375, + "epoch": 0.3592987804878049, + "grad_norm": 0.1374064401077698, + "kl": 0.0733642578125, + "learning_rate": 2.4263826856163066e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2357 + }, + { + "completion_length": 1535.6666870117188, + "epoch": 0.3594512195121951, + "grad_norm": 0.11761464189222086, + "kl": 0.083740234375, + "learning_rate": 2.4257547935431526e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2358 + }, + { + "completion_length": 861.1666870117188, + "epoch": 0.3596036585365854, + "grad_norm": 0.13634810721742033, + "kl": 0.06689453125, + "learning_rate": 2.4251266393482792e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2359 + }, + { + "completion_length": 1832.0, + "epoch": 0.3597560975609756, + "grad_norm": 0.08053520027793332, + "kl": 0.0692138671875, + "learning_rate": 2.424498223209545e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2360 + }, + { + "completion_length": 1582.1666870117188, + "epoch": 0.35990853658536587, + "grad_norm": 1.7569790919067745, + "kl": 0.0657958984375, + "learning_rate": 2.4238695453048833e-06, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2361 + }, + { + "completion_length": 1475.5000610351562, + "epoch": 0.3600609756097561, + "grad_norm": 0.09887976986478143, + "kl": 0.08251953125, + "learning_rate": 2.4232406058122984e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2362 + }, + { + "completion_length": 2157.3333435058594, + "epoch": 0.36021341463414636, + "grad_norm": 0.08221000405603891, + "kl": 0.050048828125, + "learning_rate": 2.4226114049098715e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2363 + }, + { + "completion_length": 2472.6666870117188, + "epoch": 0.3603658536585366, + "grad_norm": 0.09363402336669376, + "kl": 0.0634765625, + "learning_rate": 2.4219819427757566e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2364 + }, + { + "completion_length": 2517.8334350585938, + "epoch": 0.36051829268292684, + "grad_norm": 0.060578606312838554, + "kl": 0.03302001953125, + "learning_rate": 2.4213522195881824e-06, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2365 + }, + { + "completion_length": 1698.0, + "epoch": 0.36067073170731706, + "grad_norm": 0.07760151550095727, + "kl": 0.05419921875, + "learning_rate": 2.420722235525451e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2366 + }, + { + "completion_length": 2803.5001220703125, + "epoch": 0.36082317073170733, + "grad_norm": 0.060722546341430365, + "kl": 0.03765869140625, + "learning_rate": 2.420091990765938e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2367 + }, + { + "completion_length": 1299.666748046875, + "epoch": 0.36097560975609755, + "grad_norm": 0.12124613074174423, + "kl": 0.0596923828125, + "learning_rate": 2.4194614854880937e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2368 + }, + { + "completion_length": 2450.3333435058594, + "epoch": 0.3611280487804878, + "grad_norm": 0.0826170868663445, + "kl": 0.06591796875, + "learning_rate": 2.4188307198704417e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2369 + }, + { + "completion_length": 1455.5, + "epoch": 0.36128048780487804, + "grad_norm": 0.15018593479647172, + "kl": 0.0780029296875, + "learning_rate": 2.41819969409158e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2370 + }, + { + "completion_length": 881.3333740234375, + "epoch": 0.3614329268292683, + "grad_norm": 0.15015151601766866, + "kl": 0.088134765625, + "learning_rate": 2.4175684083301786e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2371 + }, + { + "completion_length": 1881.5, + "epoch": 0.3615853658536585, + "grad_norm": 0.3315179646437129, + "kl": 0.0772705078125, + "learning_rate": 2.4169368627649823e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2372 + }, + { + "completion_length": 1502.1666870117188, + "epoch": 0.3617378048780488, + "grad_norm": 0.11355180807870632, + "kl": 0.074951171875, + "learning_rate": 2.41630505757481e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2373 + }, + { + "completion_length": 1942.6666870117188, + "epoch": 0.361890243902439, + "grad_norm": 0.07275414581060966, + "kl": 0.056884765625, + "learning_rate": 2.4156729929385526e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2374 + }, + { + "completion_length": 1038.6667175292969, + "epoch": 0.3620426829268293, + "grad_norm": 0.1591631113138564, + "kl": 0.064453125, + "learning_rate": 2.4150406690351762e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2375 + }, + { + "completion_length": 2124.3333740234375, + "epoch": 0.3621951219512195, + "grad_norm": 0.08342870092893277, + "kl": 0.068359375, + "learning_rate": 2.4144080860437184e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2376 + }, + { + "completion_length": 964.3333740234375, + "epoch": 0.3623475609756098, + "grad_norm": 0.21599857438632925, + "kl": 0.1025390625, + "learning_rate": 2.4137752441432914e-06, + "loss": 0.0041, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2377 + }, + { + "completion_length": 1700.0000915527344, + "epoch": 0.3625, + "grad_norm": 0.22223276531230027, + "kl": 0.097900390625, + "learning_rate": 2.4131421435130812e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2378 + }, + { + "completion_length": 1719.0, + "epoch": 0.36265243902439026, + "grad_norm": 0.11618271639525579, + "kl": 0.06884765625, + "learning_rate": 2.412508784332345e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2379 + }, + { + "completion_length": 1778.8333740234375, + "epoch": 0.3628048780487805, + "grad_norm": 0.09622825555334871, + "kl": 0.06591796875, + "learning_rate": 2.411875166780416e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2380 + }, + { + "completion_length": 843.0, + "epoch": 0.36295731707317075, + "grad_norm": 0.12143658605719698, + "kl": 0.089111328125, + "learning_rate": 2.411241291036698e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2381 + }, + { + "completion_length": 2292.6666870117188, + "epoch": 0.36310975609756097, + "grad_norm": 0.21170076223966833, + "kl": 0.0726318359375, + "learning_rate": 2.4106071572806693e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2382 + }, + { + "completion_length": 2769.0, + "epoch": 0.36326219512195124, + "grad_norm": 0.05641768582305198, + "kl": 0.0487060546875, + "learning_rate": 2.409972765691881e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2383 + }, + { + "completion_length": 753.0, + "epoch": 0.36341463414634145, + "grad_norm": 0.31315923709818383, + "kl": 0.10302734375, + "learning_rate": 2.409338116449957e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2384 + }, + { + "completion_length": 2022.166748046875, + "epoch": 0.3635670731707317, + "grad_norm": 0.13322024812096148, + "kl": 0.069091796875, + "learning_rate": 2.408703209734595e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2385 + }, + { + "completion_length": 1095.1667175292969, + "epoch": 0.36371951219512194, + "grad_norm": 0.11831625085530278, + "kl": 0.0703125, + "learning_rate": 2.4080680457255632e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2386 + }, + { + "completion_length": 1003.6667175292969, + "epoch": 0.3638719512195122, + "grad_norm": 0.10744549392941695, + "kl": 0.07275390625, + "learning_rate": 2.407432624602706e-06, + "loss": 0.0029, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2387 + }, + { + "completion_length": 1634.5, + "epoch": 0.36402439024390243, + "grad_norm": 0.4941599968809499, + "kl": 0.05908203125, + "learning_rate": 2.4067969465459383e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2388 + }, + { + "completion_length": 2373.3333740234375, + "epoch": 0.3641768292682927, + "grad_norm": 0.0870022397253736, + "kl": 0.07177734375, + "learning_rate": 2.4061610117352484e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2389 + }, + { + "completion_length": 879.8333740234375, + "epoch": 0.3643292682926829, + "grad_norm": 0.12402615395744461, + "kl": 0.086669921875, + "learning_rate": 2.405524820350698e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2390 + }, + { + "completion_length": 1432.3333740234375, + "epoch": 0.3644817073170732, + "grad_norm": 0.1138198129132943, + "kl": 0.106201171875, + "learning_rate": 2.4048883725724187e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2391 + }, + { + "completion_length": 1563.5000610351562, + "epoch": 0.3646341463414634, + "grad_norm": 0.0656951874952369, + "kl": 0.051513671875, + "learning_rate": 2.404251668580619e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2392 + }, + { + "completion_length": 1045.8333740234375, + "epoch": 0.3647865853658537, + "grad_norm": 0.1074696352952498, + "kl": 0.0693359375, + "learning_rate": 2.4036147085555767e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2393 + }, + { + "completion_length": 1430.5000610351562, + "epoch": 0.3649390243902439, + "grad_norm": 0.11410461783429718, + "kl": 0.0928955078125, + "learning_rate": 2.4029774926776435e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2394 + }, + { + "completion_length": 1761.8334350585938, + "epoch": 0.36509146341463417, + "grad_norm": 0.12370599756637372, + "kl": 0.083251953125, + "learning_rate": 2.402340021127242e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2395 + }, + { + "completion_length": 1350.1666870117188, + "epoch": 0.3652439024390244, + "grad_norm": 0.13682234217683006, + "kl": 0.08837890625, + "learning_rate": 2.4017022940848696e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2396 + }, + { + "completion_length": 1003.6666870117188, + "epoch": 0.36539634146341465, + "grad_norm": 0.10735490936751522, + "kl": 0.0635986328125, + "learning_rate": 2.401064311731094e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2397 + }, + { + "completion_length": 1246.8333740234375, + "epoch": 0.36554878048780487, + "grad_norm": 0.1141300983306637, + "kl": 0.0723876953125, + "learning_rate": 2.400426074246556e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2398 + }, + { + "completion_length": 1455.5000610351562, + "epoch": 0.36570121951219514, + "grad_norm": 0.10700242434663663, + "kl": 0.05517578125, + "learning_rate": 2.399787581811969e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2399 + }, + { + "completion_length": 924.0000305175781, + "epoch": 0.36585365853658536, + "grad_norm": 1.5557595237921524, + "kl": 0.087890625, + "learning_rate": 2.3991488346081183e-06, + "loss": 0.0035, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2400 + }, + { + "completion_length": 785.8333435058594, + "epoch": 0.36600609756097563, + "grad_norm": 0.13837462447836824, + "kl": 0.083984375, + "learning_rate": 2.3985098328158605e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2401 + }, + { + "completion_length": 1287.0, + "epoch": 0.36615853658536585, + "grad_norm": 0.15289554622064644, + "kl": 0.083984375, + "learning_rate": 2.3978705766161253e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2402 + }, + { + "completion_length": 3290.0, + "epoch": 0.3663109756097561, + "grad_norm": 0.07392115409721428, + "kl": 0.04644775390625, + "learning_rate": 2.3972310661899145e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2403 + }, + { + "completion_length": 1837.8333435058594, + "epoch": 0.36646341463414633, + "grad_norm": 0.1225998658454675, + "kl": 0.05938720703125, + "learning_rate": 2.3965913017183006e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2404 + }, + { + "completion_length": 1514.5, + "epoch": 0.3666158536585366, + "grad_norm": 0.11271115483471936, + "kl": 0.0648193359375, + "learning_rate": 2.3959512833824295e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2405 + }, + { + "completion_length": 2141.0001220703125, + "epoch": 0.3667682926829268, + "grad_norm": 0.2613501371750572, + "kl": 0.053955078125, + "learning_rate": 2.3953110113635186e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2406 + }, + { + "completion_length": 1219.3333740234375, + "epoch": 0.3669207317073171, + "grad_norm": 0.2054038439663933, + "kl": 0.07763671875, + "learning_rate": 2.3946704858428568e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2407 + }, + { + "completion_length": 1825.8333740234375, + "epoch": 0.3670731707317073, + "grad_norm": 0.11495513618776211, + "kl": 0.064208984375, + "learning_rate": 2.3940297070018048e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2408 + }, + { + "completion_length": 877.5000457763672, + "epoch": 0.3672256097560976, + "grad_norm": 0.1178803167161076, + "kl": 0.057861328125, + "learning_rate": 2.393388675021795e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2409 + }, + { + "completion_length": 1562.666748046875, + "epoch": 0.3673780487804878, + "grad_norm": 2.8443903290952743, + "kl": 0.088623046875, + "learning_rate": 2.3927473900843324e-06, + "loss": 0.0035, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2410 + }, + { + "completion_length": 746.1666870117188, + "epoch": 0.36753048780487807, + "grad_norm": 0.09672974964964534, + "kl": 0.05029296875, + "learning_rate": 2.3921058523709915e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2411 + }, + { + "completion_length": 2693.666748046875, + "epoch": 0.3676829268292683, + "grad_norm": 0.1297811392494611, + "kl": 0.057861328125, + "learning_rate": 2.3914640620634213e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2412 + }, + { + "completion_length": 1048.6666870117188, + "epoch": 0.36783536585365856, + "grad_norm": 0.12155367129452327, + "kl": 0.08251953125, + "learning_rate": 2.3908220193433397e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2413 + }, + { + "completion_length": 979.1666870117188, + "epoch": 0.3679878048780488, + "grad_norm": 0.3110915577208402, + "kl": 0.0760498046875, + "learning_rate": 2.390179724392537e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2414 + }, + { + "completion_length": 1692.8333740234375, + "epoch": 0.36814024390243905, + "grad_norm": 0.12086458732944383, + "kl": 0.085205078125, + "learning_rate": 2.3895371773928757e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2415 + }, + { + "completion_length": 1453.5000610351562, + "epoch": 0.36829268292682926, + "grad_norm": 0.10048845148729021, + "kl": 0.06201171875, + "learning_rate": 2.388894378526288e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2416 + }, + { + "completion_length": 1352.8333740234375, + "epoch": 0.36844512195121953, + "grad_norm": 0.14719986121248174, + "kl": 0.093017578125, + "learning_rate": 2.38825132797478e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2417 + }, + { + "completion_length": 1203.5, + "epoch": 0.36859756097560975, + "grad_norm": 0.13480100248373872, + "kl": 0.074462890625, + "learning_rate": 2.387608025920426e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2418 + }, + { + "completion_length": 2136.666717529297, + "epoch": 0.36875, + "grad_norm": 0.0926274731029561, + "kl": 0.0419921875, + "learning_rate": 2.3869644725453737e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2419 + }, + { + "completion_length": 1042.8333740234375, + "epoch": 0.36890243902439024, + "grad_norm": 0.20413137898698552, + "kl": 0.076904296875, + "learning_rate": 2.386320668031841e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2420 + }, + { + "completion_length": 1262.0000610351562, + "epoch": 0.3690548780487805, + "grad_norm": 0.13982871383979933, + "kl": 0.08349609375, + "learning_rate": 2.385676612562117e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2421 + }, + { + "completion_length": 1494.0000610351562, + "epoch": 0.3692073170731707, + "grad_norm": 0.13394231976804957, + "kl": 0.072265625, + "learning_rate": 2.385032306318563e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2422 + }, + { + "completion_length": 1387.6666870117188, + "epoch": 0.369359756097561, + "grad_norm": 0.09540889741483716, + "kl": 0.0721435546875, + "learning_rate": 2.3843877494836083e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2423 + }, + { + "completion_length": 1687.3333435058594, + "epoch": 0.3695121951219512, + "grad_norm": 0.17288078621443903, + "kl": 0.0771484375, + "learning_rate": 2.383742942239757e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2424 + }, + { + "completion_length": 1024.0, + "epoch": 0.3696646341463415, + "grad_norm": 1.430028137688774, + "kl": 0.088134765625, + "learning_rate": 2.383097884769582e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2425 + }, + { + "completion_length": 1996.8334350585938, + "epoch": 0.3698170731707317, + "grad_norm": 0.07918041430274218, + "kl": 0.060791015625, + "learning_rate": 2.3824525772557267e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2426 + }, + { + "completion_length": 939.5, + "epoch": 0.369969512195122, + "grad_norm": 0.21597170694228418, + "kl": 0.106689453125, + "learning_rate": 2.381807019880906e-06, + "loss": 0.0043, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2427 + }, + { + "completion_length": 2240.5, + "epoch": 0.3701219512195122, + "grad_norm": 0.07626629663558106, + "kl": 0.047119140625, + "learning_rate": 2.3811612128279053e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2428 + }, + { + "completion_length": 1085.0000305175781, + "epoch": 0.37027439024390246, + "grad_norm": 1.6059558302131083, + "kl": 0.060546875, + "learning_rate": 2.380515156279582e-06, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2429 + }, + { + "completion_length": 1564.8333435058594, + "epoch": 0.3704268292682927, + "grad_norm": 0.13917753318152548, + "kl": 0.095703125, + "learning_rate": 2.3798688504188618e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2430 + }, + { + "completion_length": 820.3333435058594, + "epoch": 0.37057926829268295, + "grad_norm": 0.15292133277320075, + "kl": 0.100830078125, + "learning_rate": 2.3792222954287424e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2431 + }, + { + "completion_length": 2217.666748046875, + "epoch": 0.37073170731707317, + "grad_norm": 0.18253032994187599, + "kl": 0.068359375, + "learning_rate": 2.3785754914922923e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2432 + }, + { + "completion_length": 2978.5, + "epoch": 0.37088414634146344, + "grad_norm": 0.08827171625089873, + "kl": 0.0687255859375, + "learning_rate": 2.3779284387926494e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2433 + }, + { + "completion_length": 2308.3333435058594, + "epoch": 0.37103658536585366, + "grad_norm": 0.08867446418590975, + "kl": 0.0390625, + "learning_rate": 2.3772811375130235e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2434 + }, + { + "completion_length": 2806.8333740234375, + "epoch": 0.3711890243902439, + "grad_norm": 0.10274001180811579, + "kl": 0.0667724609375, + "learning_rate": 2.376633587836693e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2435 + }, + { + "completion_length": 2149.5001220703125, + "epoch": 0.37134146341463414, + "grad_norm": 0.2565357813109716, + "kl": 0.08154296875, + "learning_rate": 2.375985789947008e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2436 + }, + { + "completion_length": 1422.3333740234375, + "epoch": 0.3714939024390244, + "grad_norm": 0.24651152748830896, + "kl": 0.1162109375, + "learning_rate": 2.375337744027389e-06, + "loss": 0.0047, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2437 + }, + { + "completion_length": 1395.3333740234375, + "epoch": 0.37164634146341463, + "grad_norm": 0.14765383118967196, + "kl": 0.0849609375, + "learning_rate": 2.374689450261325e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2438 + }, + { + "completion_length": 1304.6666870117188, + "epoch": 0.3717987804878049, + "grad_norm": 0.1365454036161743, + "kl": 0.111083984375, + "learning_rate": 2.374040908832377e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2439 + }, + { + "completion_length": 1549.5000610351562, + "epoch": 0.3719512195121951, + "grad_norm": 2.1505976364523782, + "kl": 0.081787109375, + "learning_rate": 2.3733921199241755e-06, + "loss": 0.0033, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2440 + }, + { + "completion_length": 2525.8333740234375, + "epoch": 0.3721036585365854, + "grad_norm": 0.1469703833800669, + "kl": 0.05078125, + "learning_rate": 2.3727430837204213e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2441 + }, + { + "completion_length": 2795.33349609375, + "epoch": 0.3722560975609756, + "grad_norm": 0.07091819289741198, + "kl": 0.0614013671875, + "learning_rate": 2.372093800404884e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2442 + }, + { + "completion_length": 1054.3333435058594, + "epoch": 0.3724085365853659, + "grad_norm": 0.17622348575435992, + "kl": 0.090576171875, + "learning_rate": 2.3714442701614052e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2443 + }, + { + "completion_length": 2617.0, + "epoch": 0.3725609756097561, + "grad_norm": 0.05302465512675126, + "kl": 0.0404052734375, + "learning_rate": 2.370794493173895e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2444 + }, + { + "completion_length": 2576.1666870117188, + "epoch": 0.37271341463414637, + "grad_norm": 0.07555596429382723, + "kl": 0.03887939453125, + "learning_rate": 2.3701444696263337e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2445 + }, + { + "completion_length": 2112.5000915527344, + "epoch": 0.3728658536585366, + "grad_norm": 0.7403534487797467, + "kl": 0.07470703125, + "learning_rate": 2.369494199702771e-06, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2446 + }, + { + "completion_length": 3035.1666870117188, + "epoch": 0.37301829268292686, + "grad_norm": 0.0821113192335491, + "kl": 0.053955078125, + "learning_rate": 2.368843683587328e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2447 + }, + { + "completion_length": 1268.1666870117188, + "epoch": 0.37317073170731707, + "grad_norm": 1.6839111618994338, + "kl": 0.113525390625, + "learning_rate": 2.3681929214641924e-06, + "loss": 0.0045, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2448 + }, + { + "completion_length": 3998.166748046875, + "epoch": 0.37332317073170734, + "grad_norm": 0.04642325908849297, + "kl": 0.036865234375, + "learning_rate": 2.367541913517625e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2449 + }, + { + "completion_length": 2122.5, + "epoch": 0.37347560975609756, + "grad_norm": 0.10682757740187096, + "kl": 0.0770263671875, + "learning_rate": 2.3668906599319546e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2450 + }, + { + "completion_length": 2352.0001220703125, + "epoch": 0.37362804878048783, + "grad_norm": 0.08077142302147124, + "kl": 0.08154296875, + "learning_rate": 2.366239160891579e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2451 + }, + { + "completion_length": 2612.666748046875, + "epoch": 0.37378048780487805, + "grad_norm": 0.06763922350698229, + "kl": 0.05712890625, + "learning_rate": 2.365587416580966e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2452 + }, + { + "completion_length": 3870.3333740234375, + "epoch": 0.3739329268292683, + "grad_norm": 0.053106233983674395, + "kl": 0.0367431640625, + "learning_rate": 2.3649354271846535e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2453 + }, + { + "completion_length": 4096.0, + "epoch": 0.37408536585365854, + "grad_norm": 0.04985546622928847, + "kl": 0.0352783203125, + "learning_rate": 2.3642831928872475e-06, + "loss": 0.0014, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2454 + }, + { + "completion_length": 3793.5, + "epoch": 0.3742378048780488, + "grad_norm": 0.056051513421516874, + "kl": 0.0357666015625, + "learning_rate": 2.3636307138734253e-06, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2455 + }, + { + "completion_length": 3749.3333740234375, + "epoch": 0.374390243902439, + "grad_norm": 0.06065799648971914, + "kl": 0.0452880859375, + "learning_rate": 2.362977990327931e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2456 + }, + { + "completion_length": 2479.5000610351562, + "epoch": 0.3745426829268293, + "grad_norm": 0.07852901328613515, + "kl": 0.053955078125, + "learning_rate": 2.36232502243558e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2457 + }, + { + "completion_length": 2694.1666870117188, + "epoch": 0.3746951219512195, + "grad_norm": 0.7530333685906291, + "kl": 0.060791015625, + "learning_rate": 2.3616718103812557e-06, + "loss": 0.0024, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 2458 + }, + { + "completion_length": 1781.0000610351562, + "epoch": 0.3748475609756098, + "grad_norm": 0.1275626994734297, + "kl": 0.063720703125, + "learning_rate": 2.3610183543499117e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2459 + }, + { + "completion_length": 1175.1666870117188, + "epoch": 0.375, + "grad_norm": 0.10122067278409867, + "kl": 0.050537109375, + "learning_rate": 2.3603646545265692e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2460 + }, + { + "completion_length": 3264.666748046875, + "epoch": 0.3751524390243902, + "grad_norm": 0.0675517378289351, + "kl": 0.05126953125, + "learning_rate": 2.3597107110963195e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2461 + }, + { + "completion_length": 2645.5001220703125, + "epoch": 0.3753048780487805, + "grad_norm": 0.8340700110323038, + "kl": 0.06689453125, + "learning_rate": 2.359056524244323e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2462 + }, + { + "completion_length": 3836.8333740234375, + "epoch": 0.3754573170731707, + "grad_norm": 0.05037428144545435, + "kl": 0.0528564453125, + "learning_rate": 2.3584020941558082e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2463 + }, + { + "completion_length": 1512.1666870117188, + "epoch": 0.375609756097561, + "grad_norm": 0.22245937126151155, + "kl": 0.08447265625, + "learning_rate": 2.357747421016073e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2464 + }, + { + "completion_length": 1077.1667175292969, + "epoch": 0.3757621951219512, + "grad_norm": 0.1396342131644541, + "kl": 0.067138671875, + "learning_rate": 2.3570925050104847e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2465 + }, + { + "completion_length": 1970.166748046875, + "epoch": 0.37591463414634146, + "grad_norm": 0.14563512212903384, + "kl": 0.093017578125, + "learning_rate": 2.3564373463244773e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2466 + }, + { + "completion_length": 784.1666870117188, + "epoch": 0.3760670731707317, + "grad_norm": 2.0387131573915753, + "kl": 0.110595703125, + "learning_rate": 2.355781945143556e-06, + "loss": 0.0044, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2467 + }, + { + "completion_length": 2021.8333740234375, + "epoch": 0.37621951219512195, + "grad_norm": 0.2973999524157707, + "kl": 0.090087890625, + "learning_rate": 2.355126301653293e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2468 + }, + { + "completion_length": 1697.3333435058594, + "epoch": 0.37637195121951217, + "grad_norm": 0.10106514297275491, + "kl": 0.091552734375, + "learning_rate": 2.35447041603933e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2469 + }, + { + "completion_length": 910.3333740234375, + "epoch": 0.37652439024390244, + "grad_norm": 0.16241057222063196, + "kl": 0.0762939453125, + "learning_rate": 2.3538142884873772e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2470 + }, + { + "completion_length": 1683.3333740234375, + "epoch": 0.37667682926829266, + "grad_norm": 0.16116415461498285, + "kl": 0.0771484375, + "learning_rate": 2.3531579191832125e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2471 + }, + { + "completion_length": 1316.5000610351562, + "epoch": 0.37682926829268293, + "grad_norm": 2.3083362988390257, + "kl": 0.084716796875, + "learning_rate": 2.3525013083126835e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2472 + }, + { + "completion_length": 2263.3334350585938, + "epoch": 0.37698170731707314, + "grad_norm": 0.4630964981769272, + "kl": 0.1024169921875, + "learning_rate": 2.3518444560617042e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2473 + }, + { + "completion_length": 2273.3333740234375, + "epoch": 0.3771341463414634, + "grad_norm": 0.08076091525862981, + "kl": 0.0654296875, + "learning_rate": 2.35118736261626e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2474 + }, + { + "completion_length": 1759.6666870117188, + "epoch": 0.37728658536585363, + "grad_norm": 0.1455667373162178, + "kl": 0.105712890625, + "learning_rate": 2.350530028162401e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2475 + }, + { + "completion_length": 1457.666748046875, + "epoch": 0.3774390243902439, + "grad_norm": 0.0890974583667654, + "kl": 0.069580078125, + "learning_rate": 2.349872452886249e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2476 + }, + { + "completion_length": 783.5000610351562, + "epoch": 0.3775914634146341, + "grad_norm": 0.17452297974243391, + "kl": 0.12158203125, + "learning_rate": 2.349214636973991e-06, + "loss": 0.0048, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2477 + }, + { + "completion_length": 1963.666748046875, + "epoch": 0.3777439024390244, + "grad_norm": 0.13350493274661343, + "kl": 0.09912109375, + "learning_rate": 2.348556580611884e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2478 + }, + { + "completion_length": 1741.3333740234375, + "epoch": 0.3778963414634146, + "grad_norm": 0.08386951862651036, + "kl": 0.0908203125, + "learning_rate": 2.3478982839862528e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2479 + }, + { + "completion_length": 1468.1666870117188, + "epoch": 0.3780487804878049, + "grad_norm": 0.1370568926332205, + "kl": 0.120849609375, + "learning_rate": 2.34723974728349e-06, + "loss": 0.0048, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2480 + }, + { + "completion_length": 2092.3334350585938, + "epoch": 0.3782012195121951, + "grad_norm": 0.099238832436959, + "kl": 0.079345703125, + "learning_rate": 2.3465809706900565e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2481 + }, + { + "completion_length": 1553.5000610351562, + "epoch": 0.37835365853658537, + "grad_norm": 0.09313600265930258, + "kl": 0.064208984375, + "learning_rate": 2.3459219543924796e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2482 + }, + { + "completion_length": 813.5, + "epoch": 0.3785060975609756, + "grad_norm": 0.19605846390865767, + "kl": 0.0863037109375, + "learning_rate": 2.345262698577357e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2483 + }, + { + "completion_length": 2258.8333740234375, + "epoch": 0.37865853658536586, + "grad_norm": 0.06429307367722112, + "kl": 0.075927734375, + "learning_rate": 2.3446032034313518e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2484 + }, + { + "completion_length": 1339.8333740234375, + "epoch": 0.3788109756097561, + "grad_norm": 0.07576699166457296, + "kl": 0.0703125, + "learning_rate": 2.3439434691411967e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2485 + }, + { + "completion_length": 2196.5, + "epoch": 0.37896341463414634, + "grad_norm": 0.06455714732247272, + "kl": 0.0565185546875, + "learning_rate": 2.343283495893691e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2486 + }, + { + "completion_length": 2246.5, + "epoch": 0.37911585365853656, + "grad_norm": 0.08211559742393196, + "kl": 0.07568359375, + "learning_rate": 2.342623283875702e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2487 + }, + { + "completion_length": 1499.1666870117188, + "epoch": 0.37926829268292683, + "grad_norm": 0.141478815735072, + "kl": 0.09765625, + "learning_rate": 2.341962833274165e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2488 + }, + { + "completion_length": 2859.83349609375, + "epoch": 0.37942073170731705, + "grad_norm": 0.04608274045746727, + "kl": 0.065673828125, + "learning_rate": 2.341302144276082e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2489 + }, + { + "completion_length": 1914.166748046875, + "epoch": 0.3795731707317073, + "grad_norm": 0.17889088478410983, + "kl": 0.095458984375, + "learning_rate": 2.3406412170685237e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2490 + }, + { + "completion_length": 1870.1666870117188, + "epoch": 0.37972560975609754, + "grad_norm": 0.3050404442639703, + "kl": 0.0880126953125, + "learning_rate": 2.3399800518386268e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2491 + }, + { + "completion_length": 1215.3333435058594, + "epoch": 0.3798780487804878, + "grad_norm": 0.13637857020807484, + "kl": 0.094482421875, + "learning_rate": 2.339318648773596e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2492 + }, + { + "completion_length": 670.0, + "epoch": 0.380030487804878, + "grad_norm": 0.14507699100546811, + "kl": 0.0849609375, + "learning_rate": 2.3386570080607044e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2493 + }, + { + "completion_length": 1958.0, + "epoch": 0.3801829268292683, + "grad_norm": 0.1368306978671216, + "kl": 0.0888671875, + "learning_rate": 2.337995129887291e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2494 + }, + { + "completion_length": 702.1666870117188, + "epoch": 0.3803353658536585, + "grad_norm": 0.2595998040387128, + "kl": 0.1396484375, + "learning_rate": 2.337333014440762e-06, + "loss": 0.0056, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2495 + }, + { + "completion_length": 1999.3333740234375, + "epoch": 0.3804878048780488, + "grad_norm": 0.09360198822356934, + "kl": 0.08984375, + "learning_rate": 2.336670661908592e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2496 + }, + { + "completion_length": 1206.8333740234375, + "epoch": 0.380640243902439, + "grad_norm": 0.07695710343495547, + "kl": 0.0498046875, + "learning_rate": 2.3360080724783214e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2497 + }, + { + "completion_length": 802.5000305175781, + "epoch": 0.3807926829268293, + "grad_norm": 0.16429983495815073, + "kl": 0.110595703125, + "learning_rate": 2.3353452463375586e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2498 + }, + { + "completion_length": 2661.166748046875, + "epoch": 0.3809451219512195, + "grad_norm": 0.0862284762518074, + "kl": 0.0635986328125, + "learning_rate": 2.3346821836739787e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2499 + }, + { + "completion_length": 2084.8333435058594, + "epoch": 0.38109756097560976, + "grad_norm": 0.09402813264096285, + "kl": 0.06982421875, + "learning_rate": 2.3340188846753245e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2500 + }, + { + "completion_length": 1375.5, + "epoch": 0.38125, + "grad_norm": 1.7505262341553265, + "kl": 0.1103515625, + "learning_rate": 2.3333553495294033e-06, + "loss": 0.0044, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 2501 + }, + { + "completion_length": 2065.166748046875, + "epoch": 0.38140243902439025, + "grad_norm": 0.08727625024306719, + "kl": 0.0570068359375, + "learning_rate": 2.3326915784240923e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2502 + }, + { + "completion_length": 1589.0000915527344, + "epoch": 0.38155487804878047, + "grad_norm": 2.232180517673459, + "kl": 0.10009765625, + "learning_rate": 2.332027571547334e-06, + "loss": 0.004, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2503 + }, + { + "completion_length": 1505.3333435058594, + "epoch": 0.38170731707317074, + "grad_norm": 0.09844845855468766, + "kl": 0.064697265625, + "learning_rate": 2.3313633290871373e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2504 + }, + { + "completion_length": 859.8333740234375, + "epoch": 0.38185975609756095, + "grad_norm": 0.10326414068125989, + "kl": 0.081787109375, + "learning_rate": 2.3306988512315785e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2505 + }, + { + "completion_length": 1611.0000610351562, + "epoch": 0.3820121951219512, + "grad_norm": 0.34962012589478536, + "kl": 0.091796875, + "learning_rate": 2.3300341381688e-06, + "loss": 0.0037, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2506 + }, + { + "completion_length": 1300.0000610351562, + "epoch": 0.38216463414634144, + "grad_norm": 0.17660213721541626, + "kl": 0.078125, + "learning_rate": 2.329369190087013e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2507 + }, + { + "completion_length": 1421.166748046875, + "epoch": 0.3823170731707317, + "grad_norm": 0.16222097340521718, + "kl": 0.092041015625, + "learning_rate": 2.328704007174491e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2508 + }, + { + "completion_length": 1978.0000610351562, + "epoch": 0.38246951219512193, + "grad_norm": 0.13692106233189502, + "kl": 0.093017578125, + "learning_rate": 2.328038589619578e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2509 + }, + { + "completion_length": 2164.8333740234375, + "epoch": 0.3826219512195122, + "grad_norm": 0.12404856221319031, + "kl": 0.080810546875, + "learning_rate": 2.3273729376106823e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2510 + }, + { + "completion_length": 1223.8333435058594, + "epoch": 0.3827743902439024, + "grad_norm": 0.20133652719032713, + "kl": 0.09814453125, + "learning_rate": 2.3267070513362792e-06, + "loss": 0.0039, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2511 + }, + { + "completion_length": 1675.5001220703125, + "epoch": 0.3829268292682927, + "grad_norm": 0.11572743391938667, + "kl": 0.0792236328125, + "learning_rate": 2.3260409309849103e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2512 + }, + { + "completion_length": 1511.1667175292969, + "epoch": 0.3830792682926829, + "grad_norm": 0.11352604459509218, + "kl": 0.068359375, + "learning_rate": 2.325374576745183e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2513 + }, + { + "completion_length": 1454.1666870117188, + "epoch": 0.3832317073170732, + "grad_norm": 0.25267522571050094, + "kl": 0.075439453125, + "learning_rate": 2.324707988805772e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2514 + }, + { + "completion_length": 1933.3333740234375, + "epoch": 0.3833841463414634, + "grad_norm": 0.07904642597867516, + "kl": 0.0621337890625, + "learning_rate": 2.324041167355417e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2515 + }, + { + "completion_length": 1044.5, + "epoch": 0.38353658536585367, + "grad_norm": 1.8724674286081349, + "kl": 0.097412109375, + "learning_rate": 2.323374112582925e-06, + "loss": 0.0039, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2516 + }, + { + "completion_length": 2158.166748046875, + "epoch": 0.3836890243902439, + "grad_norm": 0.1024990442139722, + "kl": 0.0699462890625, + "learning_rate": 2.3227068246771677e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2517 + }, + { + "completion_length": 1158.0000457763672, + "epoch": 0.38384146341463415, + "grad_norm": 0.24425168953439796, + "kl": 0.075927734375, + "learning_rate": 2.3220393038270844e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2518 + }, + { + "completion_length": 2137.0000610351562, + "epoch": 0.38399390243902437, + "grad_norm": 0.12475927156497142, + "kl": 0.0828857421875, + "learning_rate": 2.3213715502216785e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2519 + }, + { + "completion_length": 2785.166748046875, + "epoch": 0.38414634146341464, + "grad_norm": 0.07501623995356735, + "kl": 0.065185546875, + "learning_rate": 2.3207035640500206e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2520 + }, + { + "completion_length": 1734.6667175292969, + "epoch": 0.38429878048780486, + "grad_norm": 0.12468023158428745, + "kl": 0.07470703125, + "learning_rate": 2.3200353455012474e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2521 + }, + { + "completion_length": 2117.5001220703125, + "epoch": 0.38445121951219513, + "grad_norm": 0.08355623599912268, + "kl": 0.083740234375, + "learning_rate": 2.31936689476456e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2522 + }, + { + "completion_length": 1834.0000915527344, + "epoch": 0.38460365853658535, + "grad_norm": 0.12920003448985207, + "kl": 0.08251953125, + "learning_rate": 2.318698212029227e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2523 + }, + { + "completion_length": 3874.3333740234375, + "epoch": 0.3847560975609756, + "grad_norm": 0.034677553136811846, + "kl": 0.0462646484375, + "learning_rate": 2.3180292974845807e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2524 + }, + { + "completion_length": 2682.6666870117188, + "epoch": 0.38490853658536583, + "grad_norm": 0.06099959311187001, + "kl": 0.0572509765625, + "learning_rate": 2.317360151320021e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2525 + }, + { + "completion_length": 1453.3333740234375, + "epoch": 0.3850609756097561, + "grad_norm": 1.7152516228935106, + "kl": 0.107666015625, + "learning_rate": 2.3166907737250116e-06, + "loss": 0.0043, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2526 + }, + { + "completion_length": 2104.8333740234375, + "epoch": 0.3852134146341463, + "grad_norm": 1.6587770441642145, + "kl": 0.08447265625, + "learning_rate": 2.3160211648890835e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2527 + }, + { + "completion_length": 4067.666748046875, + "epoch": 0.3853658536585366, + "grad_norm": 0.03906160721505127, + "kl": 0.0523681640625, + "learning_rate": 2.315351325001832e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2528 + }, + { + "completion_length": 3362.0001220703125, + "epoch": 0.3855182926829268, + "grad_norm": 0.09687039267534261, + "kl": 0.056396484375, + "learning_rate": 2.3146812542529177e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2529 + }, + { + "completion_length": 3568.8333740234375, + "epoch": 0.3856707317073171, + "grad_norm": 0.05587751573581902, + "kl": 0.0628662109375, + "learning_rate": 2.3140109528320677e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2530 + }, + { + "completion_length": 2843.666748046875, + "epoch": 0.3858231707317073, + "grad_norm": 0.0517492793498564, + "kl": 0.0587158203125, + "learning_rate": 2.3133404209290728e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2531 + }, + { + "completion_length": 1231.0000610351562, + "epoch": 0.38597560975609757, + "grad_norm": 0.12129033625554349, + "kl": 0.110107421875, + "learning_rate": 2.3126696587337903e-06, + "loss": 0.0044, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2532 + }, + { + "completion_length": 2463.666748046875, + "epoch": 0.3861280487804878, + "grad_norm": 0.14448591460530505, + "kl": 0.0753173828125, + "learning_rate": 2.311998666436143e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2533 + }, + { + "completion_length": 1192.3333740234375, + "epoch": 0.38628048780487806, + "grad_norm": 0.10795786658395146, + "kl": 0.1044921875, + "learning_rate": 2.311327444226117e-06, + "loss": 0.0042, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2534 + }, + { + "completion_length": 3712.3333740234375, + "epoch": 0.3864329268292683, + "grad_norm": 0.06097722425680558, + "kl": 0.0662841796875, + "learning_rate": 2.310655992293766e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2535 + }, + { + "completion_length": 2403.0000610351562, + "epoch": 0.38658536585365855, + "grad_norm": 0.06890470453411746, + "kl": 0.0557861328125, + "learning_rate": 2.3099843108292062e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2536 + }, + { + "completion_length": 2086.3334350585938, + "epoch": 0.38673780487804876, + "grad_norm": 0.09992841891346116, + "kl": 0.090087890625, + "learning_rate": 2.309312400022621e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2537 + }, + { + "completion_length": 2304.3333740234375, + "epoch": 0.38689024390243903, + "grad_norm": 0.11200111895792396, + "kl": 0.0760498046875, + "learning_rate": 2.3086402600642578e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2538 + }, + { + "completion_length": 3429.0001220703125, + "epoch": 0.38704268292682925, + "grad_norm": 0.08193928912575742, + "kl": 0.055908203125, + "learning_rate": 2.3079678911444274e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2539 + }, + { + "completion_length": 1869.0, + "epoch": 0.3871951219512195, + "grad_norm": 0.13081574581511352, + "kl": 0.10693359375, + "learning_rate": 2.3072952934535087e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2540 + }, + { + "completion_length": 3167.666748046875, + "epoch": 0.38734756097560974, + "grad_norm": 0.06416299684610177, + "kl": 0.058837890625, + "learning_rate": 2.3066224671819426e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2541 + }, + { + "completion_length": 1851.1666870117188, + "epoch": 0.3875, + "grad_norm": 0.11031325574999541, + "kl": 0.096435546875, + "learning_rate": 2.305949412520236e-06, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2542 + }, + { + "completion_length": 870.0, + "epoch": 0.3876524390243902, + "grad_norm": 1.828619266261958, + "kl": 0.11474609375, + "learning_rate": 2.3052761296589593e-06, + "loss": 0.0046, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2543 + }, + { + "completion_length": 1750.5001220703125, + "epoch": 0.3878048780487805, + "grad_norm": 2.230201649109286, + "kl": 0.0648193359375, + "learning_rate": 2.3046026187887498e-06, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2544 + }, + { + "completion_length": 3290.5, + "epoch": 0.3879573170731707, + "grad_norm": 0.32450341964893453, + "kl": 0.0513916015625, + "learning_rate": 2.303928880100307e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2545 + }, + { + "completion_length": 3797.0, + "epoch": 0.388109756097561, + "grad_norm": 0.03596285518997612, + "kl": 0.0438232421875, + "learning_rate": 2.3032549137843964e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2546 + }, + { + "completion_length": 3562.33349609375, + "epoch": 0.3882621951219512, + "grad_norm": 0.0483979117127073, + "kl": 0.050537109375, + "learning_rate": 2.302580720031847e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2547 + }, + { + "completion_length": 3460.3333740234375, + "epoch": 0.3884146341463415, + "grad_norm": 0.06064651302195683, + "kl": 0.0504150390625, + "learning_rate": 2.301906299033552e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2548 + }, + { + "completion_length": 2446.5, + "epoch": 0.3885670731707317, + "grad_norm": 0.19517484005101118, + "kl": 0.08935546875, + "learning_rate": 2.30123165098047e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2549 + }, + { + "completion_length": 1227.6667175292969, + "epoch": 0.38871951219512196, + "grad_norm": 0.1624090383351512, + "kl": 0.0745849609375, + "learning_rate": 2.300556776063624e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2550 + }, + { + "completion_length": 798.5000305175781, + "epoch": 0.3888719512195122, + "grad_norm": 0.16850825374307585, + "kl": 0.105224609375, + "learning_rate": 2.2998816744740996e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2551 + }, + { + "completion_length": 1719.5000610351562, + "epoch": 0.38902439024390245, + "grad_norm": 0.15628187614564978, + "kl": 0.079833984375, + "learning_rate": 2.2992063464030482e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2552 + }, + { + "completion_length": 2492.3333435058594, + "epoch": 0.38917682926829267, + "grad_norm": 0.09289995404697723, + "kl": 0.080810546875, + "learning_rate": 2.298530792041685e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2553 + }, + { + "completion_length": 1541.3334350585938, + "epoch": 0.38932926829268294, + "grad_norm": 0.11822920388703638, + "kl": 0.083984375, + "learning_rate": 2.297855011581289e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2554 + }, + { + "completion_length": 1287.3333740234375, + "epoch": 0.38948170731707316, + "grad_norm": 0.12329134240595194, + "kl": 0.07763671875, + "learning_rate": 2.2971790052132026e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2555 + }, + { + "completion_length": 1957.3333740234375, + "epoch": 0.3896341463414634, + "grad_norm": 1.6243969579735034, + "kl": 0.08251953125, + "learning_rate": 2.2965027731288335e-06, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2556 + }, + { + "completion_length": 2496.8333435058594, + "epoch": 0.38978658536585364, + "grad_norm": 0.12859783512971495, + "kl": 0.0728759765625, + "learning_rate": 2.2958263155196517e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2557 + }, + { + "completion_length": 1092.6666870117188, + "epoch": 0.3899390243902439, + "grad_norm": 0.12078889617785032, + "kl": 0.090087890625, + "learning_rate": 2.2951496325771927e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2558 + }, + { + "completion_length": 1126.3333740234375, + "epoch": 0.39009146341463413, + "grad_norm": 0.10519786683040004, + "kl": 0.0921630859375, + "learning_rate": 2.2944727244930553e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2559 + }, + { + "completion_length": 1165.8333740234375, + "epoch": 0.3902439024390244, + "grad_norm": 0.13826654434062413, + "kl": 0.094482421875, + "learning_rate": 2.293795591458901e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2560 + }, + { + "completion_length": 1937.666748046875, + "epoch": 0.3903963414634146, + "grad_norm": 0.15769195420814125, + "kl": 0.099853515625, + "learning_rate": 2.293118233666456e-06, + "loss": 0.004, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2561 + }, + { + "completion_length": 1850.8333740234375, + "epoch": 0.3905487804878049, + "grad_norm": 0.7799044786412108, + "kl": 0.123291015625, + "learning_rate": 2.292440651307511e-06, + "loss": 0.0049, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2562 + }, + { + "completion_length": 947.6666870117188, + "epoch": 0.3907012195121951, + "grad_norm": 0.28643449918817626, + "kl": 0.133056640625, + "learning_rate": 2.291762844573918e-06, + "loss": 0.0053, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2563 + }, + { + "completion_length": 1554.5000610351562, + "epoch": 0.3908536585365854, + "grad_norm": 0.12971363240122274, + "kl": 0.096435546875, + "learning_rate": 2.291084813657594e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2564 + }, + { + "completion_length": 2117.5000610351562, + "epoch": 0.3910060975609756, + "grad_norm": 0.10108310581995271, + "kl": 0.091552734375, + "learning_rate": 2.29040655875052e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2565 + }, + { + "completion_length": 1395.8333740234375, + "epoch": 0.39115853658536587, + "grad_norm": 0.1456067244334496, + "kl": 0.086669921875, + "learning_rate": 2.2897280800447382e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2566 + }, + { + "completion_length": 1798.8333740234375, + "epoch": 0.3913109756097561, + "grad_norm": 0.16900488265110852, + "kl": 0.1005859375, + "learning_rate": 2.2890493777323573e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2567 + }, + { + "completion_length": 1536.6666870117188, + "epoch": 0.39146341463414636, + "grad_norm": 0.13072559015133964, + "kl": 0.10107421875, + "learning_rate": 2.288370452005547e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2568 + }, + { + "completion_length": 1513.0, + "epoch": 0.3916158536585366, + "grad_norm": 0.11229643484641369, + "kl": 0.103271484375, + "learning_rate": 2.2876913030565403e-06, + "loss": 0.0041, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2569 + }, + { + "completion_length": 1888.0000610351562, + "epoch": 0.39176829268292684, + "grad_norm": 0.17072394072906052, + "kl": 0.1005859375, + "learning_rate": 2.287011931077635e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2570 + }, + { + "completion_length": 733.0000305175781, + "epoch": 0.39192073170731706, + "grad_norm": 0.2128494273312088, + "kl": 0.111572265625, + "learning_rate": 2.2863323362611894e-06, + "loss": 0.0045, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2571 + }, + { + "completion_length": 2013.5, + "epoch": 0.39207317073170733, + "grad_norm": 0.0757081739796923, + "kl": 0.0665283203125, + "learning_rate": 2.2856525187996287e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2572 + }, + { + "completion_length": 1985.8333435058594, + "epoch": 0.39222560975609755, + "grad_norm": 0.0814969679312668, + "kl": 0.059326171875, + "learning_rate": 2.2849724788854375e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2573 + }, + { + "completion_length": 2109.8333740234375, + "epoch": 0.3923780487804878, + "grad_norm": 0.11052417429669432, + "kl": 0.083251953125, + "learning_rate": 2.2842922167111663e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2574 + }, + { + "completion_length": 989.3333740234375, + "epoch": 0.39253048780487804, + "grad_norm": 2.005957564639874, + "kl": 0.108154296875, + "learning_rate": 2.283611732469425e-06, + "loss": 0.0043, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2575 + }, + { + "completion_length": 2044.166748046875, + "epoch": 0.3926829268292683, + "grad_norm": 0.13306547358935727, + "kl": 0.07373046875, + "learning_rate": 2.2829310263528907e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2576 + }, + { + "completion_length": 2858.166748046875, + "epoch": 0.3928353658536585, + "grad_norm": 0.0547930413318994, + "kl": 0.0582275390625, + "learning_rate": 2.2822500985542994e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2577 + }, + { + "completion_length": 2598.0, + "epoch": 0.3929878048780488, + "grad_norm": 0.2337386620612337, + "kl": 0.0821533203125, + "learning_rate": 2.2815689492664522e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2578 + }, + { + "completion_length": 3070.5, + "epoch": 0.393140243902439, + "grad_norm": 0.06055520318916946, + "kl": 0.0621337890625, + "learning_rate": 2.2808875786822123e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2579 + }, + { + "completion_length": 998.6666870117188, + "epoch": 0.3932926829268293, + "grad_norm": 0.15123062918722974, + "kl": 0.11572265625, + "learning_rate": 2.2802059869945057e-06, + "loss": 0.0046, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2580 + }, + { + "completion_length": 3071.3333740234375, + "epoch": 0.3934451219512195, + "grad_norm": 0.06923232640001778, + "kl": 0.0650634765625, + "learning_rate": 2.279524174396321e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2581 + }, + { + "completion_length": 3067.5, + "epoch": 0.3935975609756098, + "grad_norm": 0.12258047429220756, + "kl": 0.06591796875, + "learning_rate": 2.2788421410807087e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2582 + }, + { + "completion_length": 2158.8333435058594, + "epoch": 0.39375, + "grad_norm": 0.07489197122609359, + "kl": 0.068359375, + "learning_rate": 2.2781598872407826e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2583 + }, + { + "completion_length": 2474.5001220703125, + "epoch": 0.39390243902439026, + "grad_norm": 0.07839435134974657, + "kl": 0.0650634765625, + "learning_rate": 2.2774774130697184e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2584 + }, + { + "completion_length": 1152.6666870117188, + "epoch": 0.3940548780487805, + "grad_norm": 0.11953433452854442, + "kl": 0.10009765625, + "learning_rate": 2.2767947187607547e-06, + "loss": 0.004, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2585 + }, + { + "completion_length": 3667.5001220703125, + "epoch": 0.39420731707317075, + "grad_norm": 0.0347902325549992, + "kl": 0.0477294921875, + "learning_rate": 2.2761118045071916e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2586 + }, + { + "completion_length": 1804.5000610351562, + "epoch": 0.39435975609756097, + "grad_norm": 0.09326208738271827, + "kl": 0.093505859375, + "learning_rate": 2.275428670502393e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2587 + }, + { + "completion_length": 1558.5000915527344, + "epoch": 0.39451219512195124, + "grad_norm": 0.383704890481939, + "kl": 0.0806884765625, + "learning_rate": 2.2747453169397835e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2588 + }, + { + "completion_length": 1235.8333740234375, + "epoch": 0.39466463414634145, + "grad_norm": 0.1417301401100907, + "kl": 0.102783203125, + "learning_rate": 2.27406174401285e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2589 + }, + { + "completion_length": 1749.3333740234375, + "epoch": 0.3948170731707317, + "grad_norm": 0.12263006868495066, + "kl": 0.082275390625, + "learning_rate": 2.273377951915143e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2590 + }, + { + "completion_length": 2026.0, + "epoch": 0.39496951219512194, + "grad_norm": 0.13186937446121322, + "kl": 0.0986328125, + "learning_rate": 2.2726939408402727e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2591 + }, + { + "completion_length": 950.3333740234375, + "epoch": 0.3951219512195122, + "grad_norm": 0.1293631531444406, + "kl": 0.083251953125, + "learning_rate": 2.2720097109819135e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2592 + }, + { + "completion_length": 2032.166748046875, + "epoch": 0.39527439024390243, + "grad_norm": 0.06085866066199294, + "kl": 0.0582275390625, + "learning_rate": 2.2713252625338007e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2593 + }, + { + "completion_length": 1535.666748046875, + "epoch": 0.3954268292682927, + "grad_norm": 0.1077558486500162, + "kl": 0.108642578125, + "learning_rate": 2.2706405956897316e-06, + "loss": 0.0043, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2594 + }, + { + "completion_length": 828.1666870117188, + "epoch": 0.3955792682926829, + "grad_norm": 0.10263559724148108, + "kl": 0.065185546875, + "learning_rate": 2.269955710643565e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2595 + }, + { + "completion_length": 2710.5001220703125, + "epoch": 0.3957317073170732, + "grad_norm": 0.0737168173007361, + "kl": 0.0716552734375, + "learning_rate": 2.269270607589222e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2596 + }, + { + "completion_length": 3304.0001220703125, + "epoch": 0.3958841463414634, + "grad_norm": 0.05805734208479495, + "kl": 0.0472412109375, + "learning_rate": 2.2685852867206857e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2597 + }, + { + "completion_length": 1800.5000610351562, + "epoch": 0.3960365853658537, + "grad_norm": 2.13425282738376, + "kl": 0.079833984375, + "learning_rate": 2.2678997482319996e-06, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2598 + }, + { + "completion_length": 1405.6666870117188, + "epoch": 0.3961890243902439, + "grad_norm": 0.08951855300573004, + "kl": 0.093505859375, + "learning_rate": 2.26721399231727e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2599 + }, + { + "completion_length": 1002.0000610351562, + "epoch": 0.39634146341463417, + "grad_norm": 0.12255644454764199, + "kl": 0.1064453125, + "learning_rate": 2.2665280191706656e-06, + "loss": 0.0043, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2600 + }, + { + "completion_length": 968.8333740234375, + "epoch": 0.3964939024390244, + "grad_norm": 0.12403916769608946, + "kl": 0.09228515625, + "learning_rate": 2.2658418289864144e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2601 + }, + { + "completion_length": 1234.1666870117188, + "epoch": 0.39664634146341465, + "grad_norm": 0.09910202023460415, + "kl": 0.0791015625, + "learning_rate": 2.265155421958806e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2602 + }, + { + "completion_length": 1786.3333740234375, + "epoch": 0.39679878048780487, + "grad_norm": 0.09786497633016371, + "kl": 0.088623046875, + "learning_rate": 2.264468798282194e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2603 + }, + { + "completion_length": 2723.8333740234375, + "epoch": 0.39695121951219514, + "grad_norm": 0.10698369193984547, + "kl": 0.07861328125, + "learning_rate": 2.2637819581509906e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2604 + }, + { + "completion_length": 819.6666870117188, + "epoch": 0.39710365853658536, + "grad_norm": 0.14094399557666853, + "kl": 0.087158203125, + "learning_rate": 2.26309490175967e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2605 + }, + { + "completion_length": 2203.5000915527344, + "epoch": 0.39725609756097563, + "grad_norm": 0.09075087865922887, + "kl": 0.066650390625, + "learning_rate": 2.2624076293027696e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2606 + }, + { + "completion_length": 2498.0, + "epoch": 0.39740853658536585, + "grad_norm": 0.07782593878313618, + "kl": 0.0660400390625, + "learning_rate": 2.2617201409748846e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2607 + }, + { + "completion_length": 2137.6666870117188, + "epoch": 0.3975609756097561, + "grad_norm": 0.08405209852692341, + "kl": 0.068359375, + "learning_rate": 2.2610324369706735e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2608 + }, + { + "completion_length": 1145.5, + "epoch": 0.39771341463414633, + "grad_norm": 0.12951760952426067, + "kl": 0.08203125, + "learning_rate": 2.260344517484856e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2609 + }, + { + "completion_length": 1412.5, + "epoch": 0.3978658536585366, + "grad_norm": 0.12156475798502178, + "kl": 0.08203125, + "learning_rate": 2.2596563827122112e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2610 + }, + { + "completion_length": 818.6666870117188, + "epoch": 0.3980182926829268, + "grad_norm": 0.13815779126326813, + "kl": 0.08056640625, + "learning_rate": 2.258968032847582e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2611 + }, + { + "completion_length": 1875.0, + "epoch": 0.3981707317073171, + "grad_norm": 0.0904544585441667, + "kl": 0.06103515625, + "learning_rate": 2.258279468085868e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2612 + }, + { + "completion_length": 939.6666870117188, + "epoch": 0.3983231707317073, + "grad_norm": 0.14129884849412877, + "kl": 0.09228515625, + "learning_rate": 2.2575906886220338e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2613 + }, + { + "completion_length": 3149.5001220703125, + "epoch": 0.3984756097560976, + "grad_norm": 0.05586419413078634, + "kl": 0.0587158203125, + "learning_rate": 2.2569016946511027e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2614 + }, + { + "completion_length": 761.8333740234375, + "epoch": 0.3986280487804878, + "grad_norm": 0.14388269510678944, + "kl": 0.078369140625, + "learning_rate": 2.256212486368159e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2615 + }, + { + "completion_length": 1527.1666870117188, + "epoch": 0.39878048780487807, + "grad_norm": 0.16989456719478124, + "kl": 0.083984375, + "learning_rate": 2.2555230639683464e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2616 + }, + { + "completion_length": 1017.1666870117188, + "epoch": 0.3989329268292683, + "grad_norm": 0.09125307005517873, + "kl": 0.0540771484375, + "learning_rate": 2.2548334276468725e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2617 + }, + { + "completion_length": 979.1666870117188, + "epoch": 0.39908536585365856, + "grad_norm": 0.1283574314332788, + "kl": 0.08349609375, + "learning_rate": 2.254143577599003e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2618 + }, + { + "completion_length": 2738.8333740234375, + "epoch": 0.3992378048780488, + "grad_norm": 0.05822243077780506, + "kl": 0.0474853515625, + "learning_rate": 2.2534535140200643e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2619 + }, + { + "completion_length": 1570.8333740234375, + "epoch": 0.39939024390243905, + "grad_norm": 0.11397519216250979, + "kl": 0.0721435546875, + "learning_rate": 2.252763237105444e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2620 + }, + { + "completion_length": 1457.0000305175781, + "epoch": 0.39954268292682926, + "grad_norm": 0.31125344462834825, + "kl": 0.0753173828125, + "learning_rate": 2.252072747050589e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2621 + }, + { + "completion_length": 2459.666748046875, + "epoch": 0.39969512195121953, + "grad_norm": 1.0377705057422413, + "kl": 0.089599609375, + "learning_rate": 2.2513820440510077e-06, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2622 + }, + { + "completion_length": 1969.8333740234375, + "epoch": 0.39984756097560975, + "grad_norm": 0.07417178550603823, + "kl": 0.0689697265625, + "learning_rate": 2.2506911283022687e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2623 + }, + { + "completion_length": 2633.0001220703125, + "epoch": 0.4, + "grad_norm": 0.08603960618135732, + "kl": 0.059814453125, + "learning_rate": 2.25e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2624 + }, + { + "completion_length": 2539.0, + "epoch": 0.40015243902439024, + "grad_norm": 0.18203917675706296, + "kl": 0.0706787109375, + "learning_rate": 2.2493086593398906e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2625 + }, + { + "completion_length": 2248.0000610351562, + "epoch": 0.4003048780487805, + "grad_norm": 0.12429900838296057, + "kl": 0.068359375, + "learning_rate": 2.248617106517689e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2626 + }, + { + "completion_length": 2122.5001220703125, + "epoch": 0.4004573170731707, + "grad_norm": 0.0900354080250477, + "kl": 0.0777587890625, + "learning_rate": 2.2479253417292048e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2627 + }, + { + "completion_length": 2558.666748046875, + "epoch": 0.400609756097561, + "grad_norm": 0.14299297413117432, + "kl": 0.06494140625, + "learning_rate": 2.247233365170306e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2628 + }, + { + "completion_length": 2244.666748046875, + "epoch": 0.4007621951219512, + "grad_norm": 0.1378558520440724, + "kl": 0.0673828125, + "learning_rate": 2.246541177036922e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2629 + }, + { + "completion_length": 1730.8333740234375, + "epoch": 0.4009146341463415, + "grad_norm": 0.07147369586679327, + "kl": 0.068115234375, + "learning_rate": 2.2458487775250414e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2630 + }, + { + "completion_length": 1842.166748046875, + "epoch": 0.4010670731707317, + "grad_norm": 2.0309194627590728, + "kl": 0.0718994140625, + "learning_rate": 2.245156166830713e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2631 + }, + { + "completion_length": 2150.3334350585938, + "epoch": 0.401219512195122, + "grad_norm": 0.21665225109636296, + "kl": 0.0849609375, + "learning_rate": 2.2444633451500453e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2632 + }, + { + "completion_length": 4096.0, + "epoch": 0.4013719512195122, + "grad_norm": 0.04871287616666875, + "kl": 0.04052734375, + "learning_rate": 2.2437703126792055e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2633 + }, + { + "completion_length": 2049.166748046875, + "epoch": 0.40152439024390246, + "grad_norm": 0.08880843038537416, + "kl": 0.0556640625, + "learning_rate": 2.2430770696144223e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2634 + }, + { + "completion_length": 1872.0000610351562, + "epoch": 0.4016768292682927, + "grad_norm": 0.1406124368141756, + "kl": 0.079833984375, + "learning_rate": 2.2423836161519833e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2635 + }, + { + "completion_length": 1225.0000610351562, + "epoch": 0.40182926829268295, + "grad_norm": 0.11628231560435195, + "kl": 0.088134765625, + "learning_rate": 2.2416899524882353e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2636 + }, + { + "completion_length": 1615.3333740234375, + "epoch": 0.40198170731707317, + "grad_norm": 0.11613883330450575, + "kl": 0.078857421875, + "learning_rate": 2.240996078819585e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2637 + }, + { + "completion_length": 2124.0000610351562, + "epoch": 0.40213414634146344, + "grad_norm": 0.08679480022548024, + "kl": 0.0631103515625, + "learning_rate": 2.240301995342498e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2638 + }, + { + "completion_length": 2529.8333740234375, + "epoch": 0.40228658536585366, + "grad_norm": 0.07454631224212323, + "kl": 0.076416015625, + "learning_rate": 2.2396077022535e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2639 + }, + { + "completion_length": 1536.1667175292969, + "epoch": 0.4024390243902439, + "grad_norm": 0.1253231024489289, + "kl": 0.076904296875, + "learning_rate": 2.2389131997491756e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2640 + }, + { + "completion_length": 3687.0001220703125, + "epoch": 0.40259146341463414, + "grad_norm": 0.05606122942103578, + "kl": 0.0439453125, + "learning_rate": 2.2382184880261692e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2641 + }, + { + "completion_length": 1855.5000915527344, + "epoch": 0.4027439024390244, + "grad_norm": 0.0840527370296112, + "kl": 0.078857421875, + "learning_rate": 2.237523567281184e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2642 + }, + { + "completion_length": 2814.1666870117188, + "epoch": 0.40289634146341463, + "grad_norm": 0.13006324763917002, + "kl": 0.0592041015625, + "learning_rate": 2.2368284377109817e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2643 + }, + { + "completion_length": 1724.8334350585938, + "epoch": 0.4030487804878049, + "grad_norm": 0.1466871518462547, + "kl": 0.0810546875, + "learning_rate": 2.236133099512385e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2644 + }, + { + "completion_length": 2470.0000610351562, + "epoch": 0.4032012195121951, + "grad_norm": 1.524252167687931, + "kl": 0.059814453125, + "learning_rate": 2.2354375528822747e-06, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2645 + }, + { + "completion_length": 1938.666748046875, + "epoch": 0.4033536585365854, + "grad_norm": 0.15110657404183686, + "kl": 0.0648193359375, + "learning_rate": 2.2347417980175897e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2646 + }, + { + "completion_length": 1686.3334350585938, + "epoch": 0.4035060975609756, + "grad_norm": 0.10724388553905696, + "kl": 0.073974609375, + "learning_rate": 2.234045835115329e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2647 + }, + { + "completion_length": 1176.0, + "epoch": 0.4036585365853659, + "grad_norm": 0.2295431849184843, + "kl": 0.087890625, + "learning_rate": 2.2333496643725505e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2648 + }, + { + "completion_length": 3317.0001220703125, + "epoch": 0.4038109756097561, + "grad_norm": 0.06117117803107757, + "kl": 0.0452880859375, + "learning_rate": 2.23265328598637e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2649 + }, + { + "completion_length": 1628.1666870117188, + "epoch": 0.40396341463414637, + "grad_norm": 0.10863269008507288, + "kl": 0.084228515625, + "learning_rate": 2.2319567001539635e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2650 + }, + { + "completion_length": 1177.5000305175781, + "epoch": 0.4041158536585366, + "grad_norm": 0.25821622745367545, + "kl": 0.09228515625, + "learning_rate": 2.2312599070725633e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2651 + }, + { + "completion_length": 1201.5000305175781, + "epoch": 0.40426829268292686, + "grad_norm": 0.2709896229188441, + "kl": 0.102783203125, + "learning_rate": 2.230562906939464e-06, + "loss": 0.0041, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2652 + }, + { + "completion_length": 1456.8333740234375, + "epoch": 0.40442073170731707, + "grad_norm": 0.18301997881969786, + "kl": 0.096435546875, + "learning_rate": 2.229865699952016e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2653 + }, + { + "completion_length": 2507.666748046875, + "epoch": 0.40457317073170734, + "grad_norm": 0.10517889818279183, + "kl": 0.0830078125, + "learning_rate": 2.229168286307629e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2654 + }, + { + "completion_length": 920.1667175292969, + "epoch": 0.40472560975609756, + "grad_norm": 0.12389534820741058, + "kl": 0.075439453125, + "learning_rate": 2.2284706662037716e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2655 + }, + { + "completion_length": 2335.0, + "epoch": 0.40487804878048783, + "grad_norm": 0.08228679033435293, + "kl": 0.059814453125, + "learning_rate": 2.2277728398379705e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2656 + }, + { + "completion_length": 1522.6666870117188, + "epoch": 0.40503048780487805, + "grad_norm": 0.12346493725747842, + "kl": 0.111083984375, + "learning_rate": 2.2270748074078112e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2657 + }, + { + "completion_length": 1505.5, + "epoch": 0.4051829268292683, + "grad_norm": 0.14095942414575244, + "kl": 0.0718994140625, + "learning_rate": 2.2263765691109368e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2658 + }, + { + "completion_length": 2896.0001220703125, + "epoch": 0.40533536585365854, + "grad_norm": 0.2803034727506776, + "kl": 0.0684814453125, + "learning_rate": 2.225678125145049e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2659 + }, + { + "completion_length": 2238.166748046875, + "epoch": 0.4054878048780488, + "grad_norm": 0.14599784108068575, + "kl": 0.068359375, + "learning_rate": 2.2249794757079083e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2660 + }, + { + "completion_length": 2911.5, + "epoch": 0.405640243902439, + "grad_norm": 0.10001083196143079, + "kl": 0.0582275390625, + "learning_rate": 2.224280620997333e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2661 + }, + { + "completion_length": 2329.6666870117188, + "epoch": 0.4057926829268293, + "grad_norm": 0.12087278589061885, + "kl": 0.08349609375, + "learning_rate": 2.2235815612111993e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2662 + }, + { + "completion_length": 1398.3333740234375, + "epoch": 0.4059451219512195, + "grad_norm": 0.07650209987857129, + "kl": 0.0528564453125, + "learning_rate": 2.2228822965474413e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2663 + }, + { + "completion_length": 2175.5000610351562, + "epoch": 0.4060975609756098, + "grad_norm": 0.1120920198912142, + "kl": 0.078125, + "learning_rate": 2.2221828272040517e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2664 + }, + { + "completion_length": 2519.3333740234375, + "epoch": 0.40625, + "grad_norm": 0.07333774441381005, + "kl": 0.0660400390625, + "learning_rate": 2.2214831533790816e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2665 + }, + { + "completion_length": 3672.8333740234375, + "epoch": 0.4064024390243902, + "grad_norm": 0.056844509653414275, + "kl": 0.045654296875, + "learning_rate": 2.220783275270638e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2666 + }, + { + "completion_length": 1442.0000610351562, + "epoch": 0.4065548780487805, + "grad_norm": 0.1473227407007454, + "kl": 0.085205078125, + "learning_rate": 2.220083193076888e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2667 + }, + { + "completion_length": 1719.8333740234375, + "epoch": 0.4067073170731707, + "grad_norm": 0.08001227820484395, + "kl": 0.07421875, + "learning_rate": 2.2193829069960556e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2668 + }, + { + "completion_length": 2173.5001220703125, + "epoch": 0.406859756097561, + "grad_norm": 0.10091871905197895, + "kl": 0.073486328125, + "learning_rate": 2.2186824172264217e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2669 + }, + { + "completion_length": 1192.8333740234375, + "epoch": 0.4070121951219512, + "grad_norm": 0.18098444269094374, + "kl": 0.0892333984375, + "learning_rate": 2.217981723966326e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2670 + }, + { + "completion_length": 890.3333435058594, + "epoch": 0.40716463414634146, + "grad_norm": 0.1331600536971374, + "kl": 0.0732421875, + "learning_rate": 2.217280827414165e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2671 + }, + { + "completion_length": 825.6666870117188, + "epoch": 0.4073170731707317, + "grad_norm": 0.11021306785054268, + "kl": 0.0771484375, + "learning_rate": 2.2165797277683943e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2672 + }, + { + "completion_length": 1319.5000610351562, + "epoch": 0.40746951219512195, + "grad_norm": 0.11258345833501053, + "kl": 0.076904296875, + "learning_rate": 2.215878425227525e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2673 + }, + { + "completion_length": 3210.8333740234375, + "epoch": 0.40762195121951217, + "grad_norm": 0.05425977444252162, + "kl": 0.0606689453125, + "learning_rate": 2.215176919990127e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2674 + }, + { + "completion_length": 754.5000305175781, + "epoch": 0.40777439024390244, + "grad_norm": 0.09379312297090336, + "kl": 0.051513671875, + "learning_rate": 2.2144752122548273e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2675 + }, + { + "completion_length": 709.3333587646484, + "epoch": 0.40792682926829266, + "grad_norm": 3.909757193043969, + "kl": 0.082275390625, + "learning_rate": 2.213773302220309e-06, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2676 + }, + { + "completion_length": 1454.3333740234375, + "epoch": 0.40807926829268293, + "grad_norm": 0.06604639842531423, + "kl": 0.0462646484375, + "learning_rate": 2.2130711900853148e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2677 + }, + { + "completion_length": 1997.3333740234375, + "epoch": 0.40823170731707314, + "grad_norm": 0.08499818341883218, + "kl": 0.0595703125, + "learning_rate": 2.2123688760486425e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2678 + }, + { + "completion_length": 1221.666748046875, + "epoch": 0.4083841463414634, + "grad_norm": 0.1114538604838664, + "kl": 0.099609375, + "learning_rate": 2.211666360309149e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2679 + }, + { + "completion_length": 740.3333435058594, + "epoch": 0.40853658536585363, + "grad_norm": 0.3544922256390372, + "kl": 0.074951171875, + "learning_rate": 2.2109636430657463e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2680 + }, + { + "completion_length": 475.1666717529297, + "epoch": 0.4086890243902439, + "grad_norm": 0.16537718652768096, + "kl": 0.07861328125, + "learning_rate": 2.2102607245174046e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2681 + }, + { + "completion_length": 1950.666748046875, + "epoch": 0.4088414634146341, + "grad_norm": 0.11515088861328415, + "kl": 0.079345703125, + "learning_rate": 2.209557604863151e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2682 + }, + { + "completion_length": 916.1666870117188, + "epoch": 0.4089939024390244, + "grad_norm": 0.11032712670398595, + "kl": 0.085205078125, + "learning_rate": 2.2088542843020696e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2683 + }, + { + "completion_length": 1804.666748046875, + "epoch": 0.4091463414634146, + "grad_norm": 0.2443816321597142, + "kl": 0.08544921875, + "learning_rate": 2.2081507630333016e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2684 + }, + { + "completion_length": 1388.666748046875, + "epoch": 0.4092987804878049, + "grad_norm": 0.21246319499136737, + "kl": 0.0673828125, + "learning_rate": 2.2074470412560438e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2685 + }, + { + "completion_length": 1212.0000610351562, + "epoch": 0.4094512195121951, + "grad_norm": 0.1391565964098722, + "kl": 0.076171875, + "learning_rate": 2.2067431191695517e-06, + "loss": 0.003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2686 + }, + { + "completion_length": 1648.666748046875, + "epoch": 0.40960365853658537, + "grad_norm": 0.1049089991283803, + "kl": 0.07568359375, + "learning_rate": 2.2060389969731355e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2687 + }, + { + "completion_length": 2169.666748046875, + "epoch": 0.4097560975609756, + "grad_norm": 0.06021329814539154, + "kl": 0.046630859375, + "learning_rate": 2.2053346748661633e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2688 + }, + { + "completion_length": 1338.1666870117188, + "epoch": 0.40990853658536586, + "grad_norm": 0.2186596383040387, + "kl": 0.102783203125, + "learning_rate": 2.20463015304806e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2689 + }, + { + "completion_length": 2331.1666870117188, + "epoch": 0.4100609756097561, + "grad_norm": 0.09755791636496254, + "kl": 0.0592041015625, + "learning_rate": 2.2039254317183058e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2690 + }, + { + "completion_length": 885.0000457763672, + "epoch": 0.41021341463414634, + "grad_norm": 0.12015404675333224, + "kl": 0.08203125, + "learning_rate": 2.2032205110764387e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2691 + }, + { + "completion_length": 1358.1666870117188, + "epoch": 0.41036585365853656, + "grad_norm": 0.06538899077871235, + "kl": 0.040771484375, + "learning_rate": 2.202515391322052e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2692 + }, + { + "completion_length": 1398.5, + "epoch": 0.41051829268292683, + "grad_norm": 0.12472476639537561, + "kl": 0.07568359375, + "learning_rate": 2.2018100726547975e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2693 + }, + { + "completion_length": 2252.3333740234375, + "epoch": 0.41067073170731705, + "grad_norm": 0.2081670960110352, + "kl": 0.11328125, + "learning_rate": 2.2011045552743803e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2694 + }, + { + "completion_length": 745.0000305175781, + "epoch": 0.4108231707317073, + "grad_norm": 2.0790304342403743, + "kl": 0.099853515625, + "learning_rate": 2.2003988393805637e-06, + "loss": 0.004, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 2695 + }, + { + "completion_length": 2028.5001220703125, + "epoch": 0.41097560975609754, + "grad_norm": 1.0980913064548006, + "kl": 0.064453125, + "learning_rate": 2.1996929251731665e-06, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2696 + }, + { + "completion_length": 1728.0000610351562, + "epoch": 0.4111280487804878, + "grad_norm": 0.1089448262157029, + "kl": 0.061767578125, + "learning_rate": 2.198986812852065e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2697 + }, + { + "completion_length": 2149.8333740234375, + "epoch": 0.411280487804878, + "grad_norm": 0.0694551491379376, + "kl": 0.0631103515625, + "learning_rate": 2.198280502617189e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2698 + }, + { + "completion_length": 1413.8333740234375, + "epoch": 0.4114329268292683, + "grad_norm": 1.5005711649452431, + "kl": 0.0712890625, + "learning_rate": 2.1975739946685265e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2699 + }, + { + "completion_length": 1140.0000305175781, + "epoch": 0.4115853658536585, + "grad_norm": 0.11276271705834243, + "kl": 0.086181640625, + "learning_rate": 2.196867289206121e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2700 + }, + { + "completion_length": 1328.6666870117188, + "epoch": 0.4117378048780488, + "grad_norm": 0.13071737262516198, + "kl": 0.0947265625, + "learning_rate": 2.196160386430072e-06, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2701 + }, + { + "completion_length": 774.1666870117188, + "epoch": 0.411890243902439, + "grad_norm": 0.13838014223288314, + "kl": 0.0712890625, + "learning_rate": 2.1954532865405342e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2702 + }, + { + "completion_length": 1276.8333740234375, + "epoch": 0.4120426829268293, + "grad_norm": 0.36808492148683924, + "kl": 0.103271484375, + "learning_rate": 2.1947459897377185e-06, + "loss": 0.0041, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2703 + }, + { + "completion_length": 1896.3333740234375, + "epoch": 0.4121951219512195, + "grad_norm": 0.13351550363921474, + "kl": 0.095703125, + "learning_rate": 2.194038496221892e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2704 + }, + { + "completion_length": 1648.3333740234375, + "epoch": 0.41234756097560976, + "grad_norm": 0.13064692098304595, + "kl": 0.093017578125, + "learning_rate": 2.1933308061933767e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2705 + }, + { + "completion_length": 994.1666870117188, + "epoch": 0.4125, + "grad_norm": 0.1378923629434708, + "kl": 0.075439453125, + "learning_rate": 2.192622919852551e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2706 + }, + { + "completion_length": 892.6666870117188, + "epoch": 0.41265243902439025, + "grad_norm": 0.5121493978798215, + "kl": 0.088134765625, + "learning_rate": 2.1919148373998483e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2707 + }, + { + "completion_length": 1168.5000305175781, + "epoch": 0.41280487804878047, + "grad_norm": 0.19065771392719993, + "kl": 0.0855712890625, + "learning_rate": 2.1912065590357576e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2708 + }, + { + "completion_length": 1417.0, + "epoch": 0.41295731707317074, + "grad_norm": 0.13430863004657112, + "kl": 0.078857421875, + "learning_rate": 2.1904980849608232e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2709 + }, + { + "completion_length": 1090.5000610351562, + "epoch": 0.41310975609756095, + "grad_norm": 0.18480113045461988, + "kl": 0.094482421875, + "learning_rate": 2.1897894153756464e-06, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2710 + }, + { + "completion_length": 1097.3333740234375, + "epoch": 0.4132621951219512, + "grad_norm": 0.16288376840352706, + "kl": 0.0927734375, + "learning_rate": 2.1890805504808812e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2711 + }, + { + "completion_length": 721.0000305175781, + "epoch": 0.41341463414634144, + "grad_norm": 0.1230898157602718, + "kl": 0.08935546875, + "learning_rate": 2.188371490477239e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2712 + }, + { + "completion_length": 1394.0, + "epoch": 0.4135670731707317, + "grad_norm": 0.09107530741238475, + "kl": 0.078369140625, + "learning_rate": 2.187662235565486e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2713 + }, + { + "completion_length": 1007.0000610351562, + "epoch": 0.41371951219512193, + "grad_norm": 0.09612245232746293, + "kl": 0.0654296875, + "learning_rate": 2.1869527859464426e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2714 + }, + { + "completion_length": 1068.5000305175781, + "epoch": 0.4138719512195122, + "grad_norm": 2.737653357296449, + "kl": 0.1005859375, + "learning_rate": 2.1862431418209854e-06, + "loss": 0.004, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 2715 + }, + { + "completion_length": 1198.6666870117188, + "epoch": 0.4140243902439024, + "grad_norm": 0.10074574958099959, + "kl": 0.08544921875, + "learning_rate": 2.185533303390046e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2716 + }, + { + "completion_length": 1317.3333740234375, + "epoch": 0.4141768292682927, + "grad_norm": 1.388270211607343, + "kl": 0.076416015625, + "learning_rate": 2.18482327085461e-06, + "loss": 0.0031, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2717 + }, + { + "completion_length": 951.5000305175781, + "epoch": 0.4143292682926829, + "grad_norm": 0.11003894122701861, + "kl": 0.09423828125, + "learning_rate": 2.1841130444157194e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2718 + }, + { + "completion_length": 1150.1666870117188, + "epoch": 0.4144817073170732, + "grad_norm": 0.10053068587471356, + "kl": 0.0986328125, + "learning_rate": 2.18340262427447e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2719 + }, + { + "completion_length": 856.6666870117188, + "epoch": 0.4146341463414634, + "grad_norm": 0.15501917714012467, + "kl": 0.11962890625, + "learning_rate": 2.182692010632013e-06, + "loss": 0.0048, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2720 + }, + { + "completion_length": 870.6666870117188, + "epoch": 0.41478658536585367, + "grad_norm": 0.13008732828701022, + "kl": 0.100830078125, + "learning_rate": 2.1819812036895544e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2721 + }, + { + "completion_length": 1614.0000610351562, + "epoch": 0.4149390243902439, + "grad_norm": 0.11276885731042756, + "kl": 0.106201171875, + "learning_rate": 2.181270203648355e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2722 + }, + { + "completion_length": 526.5000152587891, + "epoch": 0.41509146341463415, + "grad_norm": 0.16722876278259932, + "kl": 0.098876953125, + "learning_rate": 2.180559010709729e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2723 + }, + { + "completion_length": 982.1666870117188, + "epoch": 0.41524390243902437, + "grad_norm": 0.11494781303948681, + "kl": 0.10498046875, + "learning_rate": 2.1798476250750473e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2724 + }, + { + "completion_length": 1495.3333740234375, + "epoch": 0.41539634146341464, + "grad_norm": 0.09975637378650684, + "kl": 0.103271484375, + "learning_rate": 2.179136046945734e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2725 + }, + { + "completion_length": 1211.5, + "epoch": 0.41554878048780486, + "grad_norm": 1.7535580977311285, + "kl": 0.1044921875, + "learning_rate": 2.1784242765232683e-06, + "loss": 0.0042, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2726 + }, + { + "completion_length": 1581.1666870117188, + "epoch": 0.41570121951219513, + "grad_norm": 0.11555676191204507, + "kl": 0.0927734375, + "learning_rate": 2.1777123140091826e-06, + "loss": 0.0037, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2727 + }, + { + "completion_length": 1658.5000610351562, + "epoch": 0.41585365853658535, + "grad_norm": 0.1490623592959376, + "kl": 0.08837890625, + "learning_rate": 2.177000159605065e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2728 + }, + { + "completion_length": 1489.8333740234375, + "epoch": 0.4160060975609756, + "grad_norm": 0.1457367720793222, + "kl": 0.1025390625, + "learning_rate": 2.1762878135125587e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2729 + }, + { + "completion_length": 2363.3333740234375, + "epoch": 0.41615853658536583, + "grad_norm": 0.14276649955246015, + "kl": 0.09814453125, + "learning_rate": 2.175575275933359e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2730 + }, + { + "completion_length": 1880.5000610351562, + "epoch": 0.4163109756097561, + "grad_norm": 0.10481973564205949, + "kl": 0.08935546875, + "learning_rate": 2.174862547069217e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2731 + }, + { + "completion_length": 1003.1666870117188, + "epoch": 0.4164634146341463, + "grad_norm": 0.1366009871063498, + "kl": 0.09521484375, + "learning_rate": 2.174149627121937e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2732 + }, + { + "completion_length": 2784.166748046875, + "epoch": 0.4166158536585366, + "grad_norm": 0.09247484498597924, + "kl": 0.07421875, + "learning_rate": 2.173436516293378e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2733 + }, + { + "completion_length": 2974.5001220703125, + "epoch": 0.4167682926829268, + "grad_norm": 1.2251934590558877, + "kl": 0.0548095703125, + "learning_rate": 2.1727232147854533e-06, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2734 + }, + { + "completion_length": 1543.5, + "epoch": 0.4169207317073171, + "grad_norm": 0.19512596023326148, + "kl": 0.097900390625, + "learning_rate": 2.1720097228001294e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2735 + }, + { + "completion_length": 1550.8333740234375, + "epoch": 0.4170731707317073, + "grad_norm": 0.09699801135274141, + "kl": 0.0859375, + "learning_rate": 2.1712960405394265e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2736 + }, + { + "completion_length": 1975.6666870117188, + "epoch": 0.41722560975609757, + "grad_norm": 0.4831181043916624, + "kl": 0.082763671875, + "learning_rate": 2.1705821682054204e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2737 + }, + { + "completion_length": 2262.166717529297, + "epoch": 0.4173780487804878, + "grad_norm": 0.08337450188110962, + "kl": 0.07177734375, + "learning_rate": 2.1698681060002396e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2738 + }, + { + "completion_length": 896.3333740234375, + "epoch": 0.41753048780487806, + "grad_norm": 0.11423717686795186, + "kl": 0.082763671875, + "learning_rate": 2.1691538541260656e-06, + "loss": 0.0033, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2739 + }, + { + "completion_length": 1213.8333435058594, + "epoch": 0.4176829268292683, + "grad_norm": 0.21978409630082651, + "kl": 0.0911865234375, + "learning_rate": 2.168439412785135e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2740 + }, + { + "completion_length": 3390.166748046875, + "epoch": 0.41783536585365855, + "grad_norm": 0.10049153520017491, + "kl": 0.0723876953125, + "learning_rate": 2.167724782179737e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2741 + }, + { + "completion_length": 3574.0, + "epoch": 0.41798780487804876, + "grad_norm": 0.07510569504622568, + "kl": 0.06103515625, + "learning_rate": 2.167009962512215e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2742 + }, + { + "completion_length": 1456.1666870117188, + "epoch": 0.41814024390243903, + "grad_norm": 0.10294871994343686, + "kl": 0.07470703125, + "learning_rate": 2.1662949539849656e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2743 + }, + { + "completion_length": 2261.0000610351562, + "epoch": 0.41829268292682925, + "grad_norm": 0.09229906018481596, + "kl": 0.089599609375, + "learning_rate": 2.1655797568004397e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2744 + }, + { + "completion_length": 1268.6666870117188, + "epoch": 0.4184451219512195, + "grad_norm": 2.003163835458366, + "kl": 0.083984375, + "learning_rate": 2.1648643711611402e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2745 + }, + { + "completion_length": 2624.0000610351562, + "epoch": 0.41859756097560974, + "grad_norm": 0.10869222286066127, + "kl": 0.0634765625, + "learning_rate": 2.1641487972696238e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2746 + }, + { + "completion_length": 2594.3333740234375, + "epoch": 0.41875, + "grad_norm": 0.09825549023927661, + "kl": 0.07568359375, + "learning_rate": 2.163433035328502e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2747 + }, + { + "completion_length": 873.1666870117188, + "epoch": 0.4189024390243902, + "grad_norm": 0.1242955191823682, + "kl": 0.09423828125, + "learning_rate": 2.1627170855404376e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2748 + }, + { + "completion_length": 2142.3333740234375, + "epoch": 0.4190548780487805, + "grad_norm": 0.20009573740852846, + "kl": 0.091552734375, + "learning_rate": 2.162000948108147e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2749 + }, + { + "completion_length": 1095.1666870117188, + "epoch": 0.4192073170731707, + "grad_norm": 0.154052073342919, + "kl": 0.110595703125, + "learning_rate": 2.161284623234401e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2750 + }, + { + "completion_length": 2009.3333740234375, + "epoch": 0.419359756097561, + "grad_norm": 0.11358539042231737, + "kl": 0.0849609375, + "learning_rate": 2.1605681111220216e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2751 + }, + { + "completion_length": 1328.1667175292969, + "epoch": 0.4195121951219512, + "grad_norm": 0.11470117411234564, + "kl": 0.10693359375, + "learning_rate": 2.1598514119738853e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2752 + }, + { + "completion_length": 912.1666870117188, + "epoch": 0.4196646341463415, + "grad_norm": 0.18461593671163964, + "kl": 0.08544921875, + "learning_rate": 2.159134525992921e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2753 + }, + { + "completion_length": 1736.8333740234375, + "epoch": 0.4198170731707317, + "grad_norm": 0.439161823606323, + "kl": 0.103271484375, + "learning_rate": 2.1584174533821097e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2754 + }, + { + "completion_length": 843.0000457763672, + "epoch": 0.41996951219512196, + "grad_norm": 0.1767092287529724, + "kl": 0.0751953125, + "learning_rate": 2.157700194344487e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2755 + }, + { + "completion_length": 2163.166748046875, + "epoch": 0.4201219512195122, + "grad_norm": 0.0703078698550117, + "kl": 0.081787109375, + "learning_rate": 2.1569827490831408e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2756 + }, + { + "completion_length": 866.8333740234375, + "epoch": 0.42027439024390245, + "grad_norm": 0.10692920044237511, + "kl": 0.0521240234375, + "learning_rate": 2.1562651178012097e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2757 + }, + { + "completion_length": 1637.5, + "epoch": 0.42042682926829267, + "grad_norm": 0.09264708497122393, + "kl": 0.100341796875, + "learning_rate": 2.155547300701888e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2758 + }, + { + "completion_length": 1953.8333740234375, + "epoch": 0.42057926829268294, + "grad_norm": 0.07994599117881826, + "kl": 0.078369140625, + "learning_rate": 2.15482929798842e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2759 + }, + { + "completion_length": 801.6666870117188, + "epoch": 0.42073170731707316, + "grad_norm": 0.10840983057311467, + "kl": 0.064453125, + "learning_rate": 2.154111109864105e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2760 + }, + { + "completion_length": 2027.5, + "epoch": 0.4208841463414634, + "grad_norm": 0.15558485309971173, + "kl": 0.091064453125, + "learning_rate": 2.153392736532292e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2761 + }, + { + "completion_length": 1338.8333740234375, + "epoch": 0.42103658536585364, + "grad_norm": 0.08019496802327347, + "kl": 0.074462890625, + "learning_rate": 2.1526741781963853e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2762 + }, + { + "completion_length": 1100.3333740234375, + "epoch": 0.4211890243902439, + "grad_norm": 2.305252026988006, + "kl": 0.092041015625, + "learning_rate": 2.151955435059839e-06, + "loss": 0.0037, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2763 + }, + { + "completion_length": 1252.0000610351562, + "epoch": 0.42134146341463413, + "grad_norm": 0.12797270151576984, + "kl": 0.08544921875, + "learning_rate": 2.1512365073261617e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2764 + }, + { + "completion_length": 1849.166748046875, + "epoch": 0.4214939024390244, + "grad_norm": 0.0867653355272679, + "kl": 0.0799560546875, + "learning_rate": 2.150517395198913e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2765 + }, + { + "completion_length": 2201.0001220703125, + "epoch": 0.4216463414634146, + "grad_norm": 0.06671606385432395, + "kl": 0.068359375, + "learning_rate": 2.149798098881705e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2766 + }, + { + "completion_length": 1518.5000610351562, + "epoch": 0.4217987804878049, + "grad_norm": 0.11278284435096818, + "kl": 0.078125, + "learning_rate": 2.149078618578202e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2767 + }, + { + "completion_length": 2026.5, + "epoch": 0.4219512195121951, + "grad_norm": 0.3449085964216569, + "kl": 0.081298828125, + "learning_rate": 2.1483589544921202e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2768 + }, + { + "completion_length": 2934.166748046875, + "epoch": 0.4221036585365854, + "grad_norm": 0.05921829918094006, + "kl": 0.065185546875, + "learning_rate": 2.147639106827229e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2769 + }, + { + "completion_length": 885.1666870117188, + "epoch": 0.4222560975609756, + "grad_norm": 0.15201220642880525, + "kl": 0.0888671875, + "learning_rate": 2.146919075787347e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2770 + }, + { + "completion_length": 2223.166748046875, + "epoch": 0.42240853658536587, + "grad_norm": 0.11804836247007626, + "kl": 0.070068359375, + "learning_rate": 2.1461988615763486e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2771 + }, + { + "completion_length": 1951.3334350585938, + "epoch": 0.4225609756097561, + "grad_norm": 0.08476460707558643, + "kl": 0.07861328125, + "learning_rate": 2.145478464398156e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2772 + }, + { + "completion_length": 3315.3333740234375, + "epoch": 0.42271341463414636, + "grad_norm": 0.06488378750265024, + "kl": 0.0673828125, + "learning_rate": 2.1447578844567467e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2773 + }, + { + "completion_length": 3733.166748046875, + "epoch": 0.4228658536585366, + "grad_norm": 0.04497517518209749, + "kl": 0.0626220703125, + "learning_rate": 2.144037121956147e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2774 + }, + { + "completion_length": 2498.8333435058594, + "epoch": 0.42301829268292684, + "grad_norm": 0.0849572615176027, + "kl": 0.0633544921875, + "learning_rate": 2.1433161771004378e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2775 + }, + { + "completion_length": 1971.6666870117188, + "epoch": 0.42317073170731706, + "grad_norm": 0.08550127607087767, + "kl": 0.0557861328125, + "learning_rate": 2.1425950500937493e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2776 + }, + { + "completion_length": 1099.0000305175781, + "epoch": 0.42332317073170733, + "grad_norm": 1.7976447817463397, + "kl": 0.05572509765625, + "learning_rate": 2.1418737411402645e-06, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2777 + }, + { + "completion_length": 825.8333740234375, + "epoch": 0.42347560975609755, + "grad_norm": 0.077101137638774, + "kl": 0.0462646484375, + "learning_rate": 2.1411522504442173e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2778 + }, + { + "completion_length": 1823.166748046875, + "epoch": 0.4236280487804878, + "grad_norm": 0.0888894380293043, + "kl": 0.063232421875, + "learning_rate": 2.140430578209893e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2779 + }, + { + "completion_length": 1539.1667175292969, + "epoch": 0.42378048780487804, + "grad_norm": 0.12803494863873657, + "kl": 0.065185546875, + "learning_rate": 2.13970872464163e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2780 + }, + { + "completion_length": 932.6666870117188, + "epoch": 0.4239329268292683, + "grad_norm": 1.4893583101429941, + "kl": 0.0888671875, + "learning_rate": 2.1389866899438146e-06, + "loss": 0.0036, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 2781 + }, + { + "completion_length": 2103.8333740234375, + "epoch": 0.4240853658536585, + "grad_norm": 0.14412145778166696, + "kl": 0.077392578125, + "learning_rate": 2.1382644743208883e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2782 + }, + { + "completion_length": 1068.3333740234375, + "epoch": 0.4242378048780488, + "grad_norm": 0.12416670907602094, + "kl": 0.066162109375, + "learning_rate": 2.1375420779773404e-06, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 2783 + }, + { + "completion_length": 1695.0000610351562, + "epoch": 0.424390243902439, + "grad_norm": 0.16098225293668011, + "kl": 0.08935546875, + "learning_rate": 2.1368195011177142e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2784 + }, + { + "completion_length": 1905.3333740234375, + "epoch": 0.4245426829268293, + "grad_norm": 0.12654047176205024, + "kl": 0.095947265625, + "learning_rate": 2.136096743946602e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2785 + }, + { + "completion_length": 1367.8333740234375, + "epoch": 0.4246951219512195, + "grad_norm": 1.69516974121409, + "kl": 0.100830078125, + "learning_rate": 2.1353738066686486e-06, + "loss": 0.004, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2786 + }, + { + "completion_length": 1146.3333740234375, + "epoch": 0.4248475609756098, + "grad_norm": 1.6859357435081574, + "kl": 0.1025390625, + "learning_rate": 2.134650689488549e-06, + "loss": 0.0041, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2787 + }, + { + "completion_length": 1790.5000610351562, + "epoch": 0.425, + "grad_norm": 0.1412699788429517, + "kl": 0.098388671875, + "learning_rate": 2.1339273926110494e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2788 + }, + { + "completion_length": 1772.666748046875, + "epoch": 0.42515243902439026, + "grad_norm": 0.14090470648887027, + "kl": 0.095703125, + "learning_rate": 2.1332039162409466e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2789 + }, + { + "completion_length": 1186.5000610351562, + "epoch": 0.4253048780487805, + "grad_norm": 0.0976956102199518, + "kl": 0.070068359375, + "learning_rate": 2.1324802605830885e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2790 + }, + { + "completion_length": 1539.1667175292969, + "epoch": 0.42545731707317075, + "grad_norm": 0.13238988192783946, + "kl": 0.1103515625, + "learning_rate": 2.1317564258423737e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2791 + }, + { + "completion_length": 1953.3334350585938, + "epoch": 0.42560975609756097, + "grad_norm": 0.15517386634981406, + "kl": 0.0706787109375, + "learning_rate": 2.1310324122237512e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2792 + }, + { + "completion_length": 885.6666870117188, + "epoch": 0.42576219512195124, + "grad_norm": 0.15871423673052623, + "kl": 0.07666015625, + "learning_rate": 2.1303082199322216e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2793 + }, + { + "completion_length": 2451.5001220703125, + "epoch": 0.42591463414634145, + "grad_norm": 1.778738842310104, + "kl": 0.0693359375, + "learning_rate": 2.1295838491728355e-06, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2794 + }, + { + "completion_length": 882.6666870117188, + "epoch": 0.4260670731707317, + "grad_norm": 0.14434192450670505, + "kl": 0.0975341796875, + "learning_rate": 2.1288593001506933e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2795 + }, + { + "completion_length": 2613.33349609375, + "epoch": 0.42621951219512194, + "grad_norm": 0.0920307202173818, + "kl": 0.067138671875, + "learning_rate": 2.128134573070947e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2796 + }, + { + "completion_length": 1388.166748046875, + "epoch": 0.4263719512195122, + "grad_norm": 0.10434203322244145, + "kl": 0.0849609375, + "learning_rate": 2.1274096681387983e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2797 + }, + { + "completion_length": 1563.1666870117188, + "epoch": 0.42652439024390243, + "grad_norm": 0.26481772322123676, + "kl": 0.091796875, + "learning_rate": 2.1266845855595e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2798 + }, + { + "completion_length": 1538.3334350585938, + "epoch": 0.4266768292682927, + "grad_norm": 0.12850230575583943, + "kl": 0.120849609375, + "learning_rate": 2.1259593255383544e-06, + "loss": 0.0048, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2799 + }, + { + "completion_length": 1160.6667175292969, + "epoch": 0.4268292682926829, + "grad_norm": 0.1718089045964096, + "kl": 0.1083984375, + "learning_rate": 2.125233888280715e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2800 + }, + { + "completion_length": 2012.166748046875, + "epoch": 0.4269817073170732, + "grad_norm": 0.0950529115254113, + "kl": 0.06396484375, + "learning_rate": 2.1245082739919827e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2801 + }, + { + "completion_length": 802.1666870117188, + "epoch": 0.4271341463414634, + "grad_norm": 0.13548602511245267, + "kl": 0.090576171875, + "learning_rate": 2.1237824828776135e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2802 + }, + { + "completion_length": 1060.3333740234375, + "epoch": 0.4272865853658537, + "grad_norm": 0.13480759069562798, + "kl": 0.11572265625, + "learning_rate": 2.1230565151431087e-06, + "loss": 0.0046, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2803 + }, + { + "completion_length": 1512.0, + "epoch": 0.4274390243902439, + "grad_norm": 0.08419376040953397, + "kl": 0.087158203125, + "learning_rate": 2.1223303709940226e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2804 + }, + { + "completion_length": 1661.6667175292969, + "epoch": 0.42759146341463417, + "grad_norm": 0.08929293562664879, + "kl": 0.07080078125, + "learning_rate": 2.1216040506359577e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2805 + }, + { + "completion_length": 890.1666870117188, + "epoch": 0.4277439024390244, + "grad_norm": 0.13193263081384043, + "kl": 0.08447265625, + "learning_rate": 2.120877554274568e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2806 + }, + { + "completion_length": 2108.166748046875, + "epoch": 0.42789634146341465, + "grad_norm": 0.08543882929678122, + "kl": 0.076416015625, + "learning_rate": 2.120150882115555e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2807 + }, + { + "completion_length": 826.8333435058594, + "epoch": 0.42804878048780487, + "grad_norm": 0.09948852957273555, + "kl": 0.07861328125, + "learning_rate": 2.1194240343646732e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2808 + }, + { + "completion_length": 1322.6666870117188, + "epoch": 0.42820121951219514, + "grad_norm": 1.8402271803724306, + "kl": 0.1142578125, + "learning_rate": 2.118697011227724e-06, + "loss": 0.0046, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2809 + }, + { + "completion_length": 1390.8333740234375, + "epoch": 0.42835365853658536, + "grad_norm": 0.28769917647791104, + "kl": 0.0810546875, + "learning_rate": 2.1179698129105592e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2810 + }, + { + "completion_length": 992.6666870117188, + "epoch": 0.42850609756097563, + "grad_norm": 0.10750674711288524, + "kl": 0.076171875, + "learning_rate": 2.117242439619081e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2811 + }, + { + "completion_length": 3378.166748046875, + "epoch": 0.42865853658536585, + "grad_norm": 0.04961399260457265, + "kl": 0.0526123046875, + "learning_rate": 2.11651489155924e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2812 + }, + { + "completion_length": 3452.83349609375, + "epoch": 0.4288109756097561, + "grad_norm": 0.04720088369178133, + "kl": 0.0418701171875, + "learning_rate": 2.115787168937038e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2813 + }, + { + "completion_length": 2020.666748046875, + "epoch": 0.42896341463414633, + "grad_norm": 0.08672149150168407, + "kl": 0.0738525390625, + "learning_rate": 2.115059271958524e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2814 + }, + { + "completion_length": 3392.666748046875, + "epoch": 0.4291158536585366, + "grad_norm": 0.050466476303037744, + "kl": 0.0374755859375, + "learning_rate": 2.1143312008297983e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2815 + }, + { + "completion_length": 2095.666748046875, + "epoch": 0.4292682926829268, + "grad_norm": 1.3982492841257872, + "kl": 0.0810546875, + "learning_rate": 2.11360295575701e-06, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2816 + }, + { + "completion_length": 2579.5, + "epoch": 0.4294207317073171, + "grad_norm": 0.10531534960515036, + "kl": 0.06005859375, + "learning_rate": 2.112874536946356e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2817 + }, + { + "completion_length": 660.6666870117188, + "epoch": 0.4295731707317073, + "grad_norm": 0.16278885659762857, + "kl": 0.094970703125, + "learning_rate": 2.1121459446040847e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2818 + }, + { + "completion_length": 3679.166748046875, + "epoch": 0.4297256097560976, + "grad_norm": 0.044559117434661503, + "kl": 0.040771484375, + "learning_rate": 2.1114171789364916e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2819 + }, + { + "completion_length": 4096.0, + "epoch": 0.4298780487804878, + "grad_norm": 0.6891616340730375, + "kl": 0.0408935546875, + "learning_rate": 2.110688240149923e-06, + "loss": 0.0016, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2820 + }, + { + "completion_length": 4096.0, + "epoch": 0.43003048780487807, + "grad_norm": 0.04967037345768117, + "kl": 0.03973388671875, + "learning_rate": 2.1099591284507724e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2821 + }, + { + "completion_length": 2627.3333740234375, + "epoch": 0.4301829268292683, + "grad_norm": 0.11262192686522571, + "kl": 0.0699462890625, + "learning_rate": 2.1092298440454843e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2822 + }, + { + "completion_length": 2542.6666870117188, + "epoch": 0.43033536585365856, + "grad_norm": 0.07339645515368015, + "kl": 0.059326171875, + "learning_rate": 2.10850038714055e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2823 + }, + { + "completion_length": 3353.166748046875, + "epoch": 0.4304878048780488, + "grad_norm": 0.06850288569693284, + "kl": 0.0445556640625, + "learning_rate": 2.1077707579425114e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2824 + }, + { + "completion_length": 4096.0, + "epoch": 0.43064024390243905, + "grad_norm": 0.040817292485428196, + "kl": 0.0389404296875, + "learning_rate": 2.107040956657959e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2825 + }, + { + "completion_length": 2887.3333740234375, + "epoch": 0.43079268292682926, + "grad_norm": 0.07031261776099591, + "kl": 0.0592041015625, + "learning_rate": 2.10631098349353e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2826 + }, + { + "completion_length": 3490.8333740234375, + "epoch": 0.43094512195121953, + "grad_norm": 0.08138761134587802, + "kl": 0.0511474609375, + "learning_rate": 2.105580838655913e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2827 + }, + { + "completion_length": 3965.666748046875, + "epoch": 0.43109756097560975, + "grad_norm": 0.03792709773621178, + "kl": 0.0421142578125, + "learning_rate": 2.1048505223518433e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2828 + }, + { + "completion_length": 2486.6666870117188, + "epoch": 0.43125, + "grad_norm": 0.09707768806724361, + "kl": 0.07763671875, + "learning_rate": 2.104120034788106e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2829 + }, + { + "completion_length": 4062.0, + "epoch": 0.43140243902439024, + "grad_norm": 0.03738106454933179, + "kl": 0.0411376953125, + "learning_rate": 2.1033893761715335e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2830 + }, + { + "completion_length": 3502.5, + "epoch": 0.4315548780487805, + "grad_norm": 0.0967332962402863, + "kl": 0.064208984375, + "learning_rate": 2.1026585467090076e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2831 + }, + { + "completion_length": 1462.0000610351562, + "epoch": 0.4317073170731707, + "grad_norm": 0.1019395204158888, + "kl": 0.0784912109375, + "learning_rate": 2.1019275466074585e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2832 + }, + { + "completion_length": 3372.5, + "epoch": 0.431859756097561, + "grad_norm": 0.058063797920666176, + "kl": 0.054443359375, + "learning_rate": 2.1011963760738633e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2833 + }, + { + "completion_length": 1709.8333435058594, + "epoch": 0.4320121951219512, + "grad_norm": 0.12921682219351727, + "kl": 0.087158203125, + "learning_rate": 2.1004650353152495e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2834 + }, + { + "completion_length": 2804.3333740234375, + "epoch": 0.4321646341463415, + "grad_norm": 0.060377216521171354, + "kl": 0.0614013671875, + "learning_rate": 2.099733524538691e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2835 + }, + { + "completion_length": 2195.0000610351562, + "epoch": 0.4323170731707317, + "grad_norm": 0.09345849355487347, + "kl": 0.0687255859375, + "learning_rate": 2.0990018439513105e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2836 + }, + { + "completion_length": 3413.5001220703125, + "epoch": 0.432469512195122, + "grad_norm": 0.04967576132645029, + "kl": 0.0498046875, + "learning_rate": 2.0982699937602796e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2837 + }, + { + "completion_length": 3311.0, + "epoch": 0.4326219512195122, + "grad_norm": 0.05889724170491668, + "kl": 0.0533447265625, + "learning_rate": 2.0975379741728156e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2838 + }, + { + "completion_length": 2459.8333740234375, + "epoch": 0.43277439024390246, + "grad_norm": 0.07358810439351511, + "kl": 0.070068359375, + "learning_rate": 2.096805785396187e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2839 + }, + { + "completion_length": 2902.166748046875, + "epoch": 0.4329268292682927, + "grad_norm": 0.0794776440106118, + "kl": 0.0587158203125, + "learning_rate": 2.0960734276377082e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2840 + }, + { + "completion_length": 619.1666717529297, + "epoch": 0.43307926829268295, + "grad_norm": 0.12291586053353727, + "kl": 0.06982421875, + "learning_rate": 2.0953409011047404e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2841 + }, + { + "completion_length": 1446.1666870117188, + "epoch": 0.43323170731707317, + "grad_norm": 0.14448702123861076, + "kl": 0.085693359375, + "learning_rate": 2.0946082060046953e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2842 + }, + { + "completion_length": 1050.6667175292969, + "epoch": 0.43338414634146344, + "grad_norm": 0.09703742532126908, + "kl": 0.068115234375, + "learning_rate": 2.0938753425450307e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2843 + }, + { + "completion_length": 1335.3333740234375, + "epoch": 0.43353658536585366, + "grad_norm": 0.13179666699245335, + "kl": 0.10498046875, + "learning_rate": 2.093142310933252e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2844 + }, + { + "completion_length": 2370.166717529297, + "epoch": 0.4336890243902439, + "grad_norm": 0.10758376510064918, + "kl": 0.0743408203125, + "learning_rate": 2.092409111376913e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2845 + }, + { + "completion_length": 2410.666717529297, + "epoch": 0.43384146341463414, + "grad_norm": 0.06381297351524726, + "kl": 0.0533447265625, + "learning_rate": 2.091675744083614e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2846 + }, + { + "completion_length": 1742.3333740234375, + "epoch": 0.4339939024390244, + "grad_norm": 0.08094381933685026, + "kl": 0.084228515625, + "learning_rate": 2.0909422092610033e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2847 + }, + { + "completion_length": 2173.5, + "epoch": 0.43414634146341463, + "grad_norm": 0.06651075264624473, + "kl": 0.066162109375, + "learning_rate": 2.0902085071167774e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2848 + }, + { + "completion_length": 1807.0000610351562, + "epoch": 0.4342987804878049, + "grad_norm": 1.320047960341704, + "kl": 0.099853515625, + "learning_rate": 2.0894746378586795e-06, + "loss": 0.004, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2849 + }, + { + "completion_length": 797.8333740234375, + "epoch": 0.4344512195121951, + "grad_norm": 0.7189773160384552, + "kl": 0.080078125, + "learning_rate": 2.0887406016944995e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2850 + }, + { + "completion_length": 1520.8333740234375, + "epoch": 0.4346036585365854, + "grad_norm": 0.11159093024427358, + "kl": 0.0570068359375, + "learning_rate": 2.0880063988320763e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2851 + }, + { + "completion_length": 2894.666748046875, + "epoch": 0.4347560975609756, + "grad_norm": 0.07653815831236584, + "kl": 0.0654296875, + "learning_rate": 2.0872720294792936e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2852 + }, + { + "completion_length": 2056.0001220703125, + "epoch": 0.4349085365853659, + "grad_norm": 1.8750016329083838, + "kl": 0.086669921875, + "learning_rate": 2.086537493844084e-06, + "loss": 0.0035, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2853 + }, + { + "completion_length": 1390.6666870117188, + "epoch": 0.4350609756097561, + "grad_norm": 1.8243428523331515, + "kl": 0.066162109375, + "learning_rate": 2.0858027921344266e-06, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2854 + }, + { + "completion_length": 2964.666748046875, + "epoch": 0.43521341463414637, + "grad_norm": 0.05259688396003096, + "kl": 0.0653076171875, + "learning_rate": 2.0850679245583483e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2855 + }, + { + "completion_length": 1589.0000610351562, + "epoch": 0.4353658536585366, + "grad_norm": 0.11004810872266284, + "kl": 0.09375, + "learning_rate": 2.0843328913239216e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2856 + }, + { + "completion_length": 1633.5000915527344, + "epoch": 0.43551829268292686, + "grad_norm": 0.1147062884401357, + "kl": 0.077392578125, + "learning_rate": 2.083597692639266e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2857 + }, + { + "completion_length": 2014.3334350585938, + "epoch": 0.43567073170731707, + "grad_norm": 0.1230493717328075, + "kl": 0.065673828125, + "learning_rate": 2.08286232871255e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2858 + }, + { + "completion_length": 3186.8333740234375, + "epoch": 0.43582317073170734, + "grad_norm": 0.05927266429705131, + "kl": 0.0504150390625, + "learning_rate": 2.082126799751986e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2859 + }, + { + "completion_length": 1616.0000915527344, + "epoch": 0.43597560975609756, + "grad_norm": 0.12286153930645032, + "kl": 0.07958984375, + "learning_rate": 2.081391105965836e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2860 + }, + { + "completion_length": 3886.5, + "epoch": 0.43612804878048783, + "grad_norm": 0.04041417237044382, + "kl": 0.045654296875, + "learning_rate": 2.080655247562405e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2861 + }, + { + "completion_length": 2075.0, + "epoch": 0.43628048780487805, + "grad_norm": 0.07900157203742587, + "kl": 0.0504150390625, + "learning_rate": 2.079919224750048e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2862 + }, + { + "completion_length": 3797.166748046875, + "epoch": 0.4364329268292683, + "grad_norm": 0.8074257831560987, + "kl": 0.0421142578125, + "learning_rate": 2.0791830377371657e-06, + "loss": 0.0017, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2863 + }, + { + "completion_length": 2165.166717529297, + "epoch": 0.43658536585365854, + "grad_norm": 0.09871683449822073, + "kl": 0.0572509765625, + "learning_rate": 2.0784466867322037e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2864 + }, + { + "completion_length": 2180.0, + "epoch": 0.4367378048780488, + "grad_norm": 0.9275032075593677, + "kl": 0.05224609375, + "learning_rate": 2.0777101719436563e-06, + "loss": 0.0021, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2865 + }, + { + "completion_length": 2981.0001220703125, + "epoch": 0.436890243902439, + "grad_norm": 0.09123628413551738, + "kl": 0.0733642578125, + "learning_rate": 2.0769734935800618e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2866 + }, + { + "completion_length": 814.0000305175781, + "epoch": 0.4370426829268293, + "grad_norm": 0.0897632481679324, + "kl": 0.060546875, + "learning_rate": 2.0762366518500075e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2867 + }, + { + "completion_length": 2408.3334350585938, + "epoch": 0.4371951219512195, + "grad_norm": 0.15636362163649584, + "kl": 0.079833984375, + "learning_rate": 2.075499646962125e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2868 + }, + { + "completion_length": 1531.1666870117188, + "epoch": 0.4373475609756098, + "grad_norm": 0.23565528628559868, + "kl": 0.087158203125, + "learning_rate": 2.0747624791250928e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2869 + }, + { + "completion_length": 735.3333740234375, + "epoch": 0.4375, + "grad_norm": 1.9890440746766305, + "kl": 0.101806640625, + "learning_rate": 2.074025148547635e-06, + "loss": 0.0041, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2870 + }, + { + "completion_length": 2899.0001220703125, + "epoch": 0.4376524390243902, + "grad_norm": 0.08065263588672111, + "kl": 0.0660400390625, + "learning_rate": 2.073287655438522e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2871 + }, + { + "completion_length": 987.5000610351562, + "epoch": 0.4378048780487805, + "grad_norm": 0.1523036180231642, + "kl": 0.0706787109375, + "learning_rate": 2.0725500000065715e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2872 + }, + { + "completion_length": 2366.0000915527344, + "epoch": 0.4379573170731707, + "grad_norm": 0.08309641627859281, + "kl": 0.0687255859375, + "learning_rate": 2.0718121824606446e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2873 + }, + { + "completion_length": 3301.666748046875, + "epoch": 0.438109756097561, + "grad_norm": 0.0738476075787068, + "kl": 0.0604248046875, + "learning_rate": 2.071074203009651e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2874 + }, + { + "completion_length": 807.3333740234375, + "epoch": 0.4382621951219512, + "grad_norm": 0.22429538042631875, + "kl": 0.091064453125, + "learning_rate": 2.070336061862544e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2875 + }, + { + "completion_length": 1965.5, + "epoch": 0.43841463414634146, + "grad_norm": 0.2318294143879576, + "kl": 0.105224609375, + "learning_rate": 2.0695977592283246e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2876 + }, + { + "completion_length": 1422.1667175292969, + "epoch": 0.4385670731707317, + "grad_norm": 0.14339679415942327, + "kl": 0.0927734375, + "learning_rate": 2.0688592953160378e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2877 + }, + { + "completion_length": 1739.0001220703125, + "epoch": 0.43871951219512195, + "grad_norm": 0.18646777198205594, + "kl": 0.090087890625, + "learning_rate": 2.0681206703347758e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2878 + }, + { + "completion_length": 936.3333435058594, + "epoch": 0.43887195121951217, + "grad_norm": 0.1423215204163098, + "kl": 0.074462890625, + "learning_rate": 2.067381884493675e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2879 + }, + { + "completion_length": 2374.0, + "epoch": 0.43902439024390244, + "grad_norm": 0.06963267979768736, + "kl": 0.0595703125, + "learning_rate": 2.0666429380019185e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2880 + }, + { + "completion_length": 1863.3333435058594, + "epoch": 0.43917682926829266, + "grad_norm": 0.17172696094762327, + "kl": 0.085693359375, + "learning_rate": 2.0659038310687346e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2881 + }, + { + "completion_length": 1634.8333740234375, + "epoch": 0.43932926829268293, + "grad_norm": 0.09667795329143969, + "kl": 0.08154296875, + "learning_rate": 2.065164563903396e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2882 + }, + { + "completion_length": 1073.1667175292969, + "epoch": 0.43948170731707314, + "grad_norm": 0.23771794541553162, + "kl": 0.08837890625, + "learning_rate": 2.0644251367152226e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2883 + }, + { + "completion_length": 970.5, + "epoch": 0.4396341463414634, + "grad_norm": 0.09426794693979423, + "kl": 0.0616455078125, + "learning_rate": 2.0636855497135772e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2884 + }, + { + "completion_length": 3238.5, + "epoch": 0.43978658536585363, + "grad_norm": 0.04984859259495144, + "kl": 0.0552978515625, + "learning_rate": 2.0629458031078705e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2885 + }, + { + "completion_length": 2006.5000610351562, + "epoch": 0.4399390243902439, + "grad_norm": 2.380755961381901, + "kl": 0.1029052734375, + "learning_rate": 2.062205897107557e-06, + "loss": 0.0041, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2886 + }, + { + "completion_length": 1440.3333740234375, + "epoch": 0.4400914634146341, + "grad_norm": 0.16606468428552937, + "kl": 0.0772705078125, + "learning_rate": 2.0614658319221363e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2887 + }, + { + "completion_length": 2597.0, + "epoch": 0.4402439024390244, + "grad_norm": 2.1035459113368487, + "kl": 0.0689697265625, + "learning_rate": 2.060725607761153e-06, + "loss": 0.0028, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2888 + }, + { + "completion_length": 1809.0000610351562, + "epoch": 0.4403963414634146, + "grad_norm": 0.18397859002650227, + "kl": 0.10791015625, + "learning_rate": 2.0599852248341974e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2889 + }, + { + "completion_length": 2092.0000610351562, + "epoch": 0.4405487804878049, + "grad_norm": 0.08505557935741757, + "kl": 0.0660400390625, + "learning_rate": 2.0592446833509047e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2890 + }, + { + "completion_length": 1358.0000610351562, + "epoch": 0.4407012195121951, + "grad_norm": 0.10914541646653682, + "kl": 0.0760498046875, + "learning_rate": 2.0585039835209537e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2891 + }, + { + "completion_length": 2066.8334350585938, + "epoch": 0.44085365853658537, + "grad_norm": 0.12408109515085891, + "kl": 0.0732421875, + "learning_rate": 2.0577631255540692e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2892 + }, + { + "completion_length": 884.1666870117188, + "epoch": 0.4410060975609756, + "grad_norm": 0.1613290607294743, + "kl": 0.07080078125, + "learning_rate": 2.0570221096600208e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2893 + }, + { + "completion_length": 3066.0001220703125, + "epoch": 0.44115853658536586, + "grad_norm": 0.08347325906463117, + "kl": 0.058349609375, + "learning_rate": 2.0562809360486222e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2894 + }, + { + "completion_length": 1484.1666870117188, + "epoch": 0.4413109756097561, + "grad_norm": 0.12283862163850745, + "kl": 0.0869140625, + "learning_rate": 2.0555396049297323e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2895 + }, + { + "completion_length": 1985.8333435058594, + "epoch": 0.44146341463414634, + "grad_norm": 0.153809436968694, + "kl": 0.072265625, + "learning_rate": 2.0547981165132547e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2896 + }, + { + "completion_length": 2236.3333740234375, + "epoch": 0.44161585365853656, + "grad_norm": 0.8757574891940578, + "kl": 0.078857421875, + "learning_rate": 2.0540564710091366e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2897 + }, + { + "completion_length": 2171.3333740234375, + "epoch": 0.44176829268292683, + "grad_norm": 0.10259372481008432, + "kl": 0.077392578125, + "learning_rate": 2.053314668627371e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2898 + }, + { + "completion_length": 3089.666748046875, + "epoch": 0.44192073170731705, + "grad_norm": 0.061413349295634675, + "kl": 0.062255859375, + "learning_rate": 2.0525727095779946e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2899 + }, + { + "completion_length": 1956.666748046875, + "epoch": 0.4420731707317073, + "grad_norm": 1.6037418253180111, + "kl": 0.0908203125, + "learning_rate": 2.051830594071088e-06, + "loss": 0.0036, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 2900 + }, + { + "completion_length": 964.8333435058594, + "epoch": 0.44222560975609754, + "grad_norm": 0.15631434726717172, + "kl": 0.100341796875, + "learning_rate": 2.0510883223167767e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2901 + }, + { + "completion_length": 2857.6666870117188, + "epoch": 0.4423780487804878, + "grad_norm": 0.2954569063891475, + "kl": 0.0736083984375, + "learning_rate": 2.0503458945252302e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2902 + }, + { + "completion_length": 1702.0001220703125, + "epoch": 0.442530487804878, + "grad_norm": 0.3109278454106705, + "kl": 0.092529296875, + "learning_rate": 2.0496033109066626e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2903 + }, + { + "completion_length": 2966.5001220703125, + "epoch": 0.4426829268292683, + "grad_norm": 0.06588862941087426, + "kl": 0.0653076171875, + "learning_rate": 2.048860571671332e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2904 + }, + { + "completion_length": 2835.1666870117188, + "epoch": 0.4428353658536585, + "grad_norm": 0.08485537471861725, + "kl": 0.06396484375, + "learning_rate": 2.0481176770295403e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2905 + }, + { + "completion_length": 2824.5, + "epoch": 0.4429878048780488, + "grad_norm": 0.06864467418722763, + "kl": 0.0626220703125, + "learning_rate": 2.047374627191633e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2906 + }, + { + "completion_length": 2198.1666870117188, + "epoch": 0.443140243902439, + "grad_norm": 0.10726300978640133, + "kl": 0.078369140625, + "learning_rate": 2.0466314223680005e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2907 + }, + { + "completion_length": 989.8333740234375, + "epoch": 0.4432926829268293, + "grad_norm": 0.14520209588780184, + "kl": 0.07958984375, + "learning_rate": 2.045888062769077e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2908 + }, + { + "completion_length": 3541.166748046875, + "epoch": 0.4434451219512195, + "grad_norm": 0.08195563963143594, + "kl": 0.07958984375, + "learning_rate": 2.0451445486053392e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2909 + }, + { + "completion_length": 1817.1666870117188, + "epoch": 0.44359756097560976, + "grad_norm": 0.12325220966765829, + "kl": 0.072509765625, + "learning_rate": 2.0444008800873097e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2910 + }, + { + "completion_length": 3808.666748046875, + "epoch": 0.44375, + "grad_norm": 0.04004997660375973, + "kl": 0.05078125, + "learning_rate": 2.0436570574255523e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2911 + }, + { + "completion_length": 2720.0001220703125, + "epoch": 0.44390243902439025, + "grad_norm": 0.10017647200921204, + "kl": 0.0584716796875, + "learning_rate": 2.0429130808306767e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2912 + }, + { + "completion_length": 2403.3333740234375, + "epoch": 0.44405487804878047, + "grad_norm": 0.1489081943441442, + "kl": 0.0859375, + "learning_rate": 2.0421689505133353e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2913 + }, + { + "completion_length": 1896.166748046875, + "epoch": 0.44420731707317074, + "grad_norm": 0.1201695815424395, + "kl": 0.079833984375, + "learning_rate": 2.0414246666842234e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2914 + }, + { + "completion_length": 1934.1667175292969, + "epoch": 0.44435975609756095, + "grad_norm": 0.11960973388138929, + "kl": 0.084228515625, + "learning_rate": 2.0406802295540802e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2915 + }, + { + "completion_length": 2602.0001220703125, + "epoch": 0.4445121951219512, + "grad_norm": 0.06329983598853194, + "kl": 0.0640869140625, + "learning_rate": 2.03993563933369e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2916 + }, + { + "completion_length": 1345.1667175292969, + "epoch": 0.44466463414634144, + "grad_norm": 2.428977647404776, + "kl": 0.108642578125, + "learning_rate": 2.0391908962338767e-06, + "loss": 0.0043, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2917 + }, + { + "completion_length": 3481.5, + "epoch": 0.4448170731707317, + "grad_norm": 0.05360401866685628, + "kl": 0.0531005859375, + "learning_rate": 2.038446000465511e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2918 + }, + { + "completion_length": 3797.666748046875, + "epoch": 0.44496951219512193, + "grad_norm": 0.042190960698703164, + "kl": 0.0511474609375, + "learning_rate": 2.0377009522395054e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2919 + }, + { + "completion_length": 3770.166748046875, + "epoch": 0.4451219512195122, + "grad_norm": 0.03720309538734341, + "kl": 0.0528564453125, + "learning_rate": 2.036955751766815e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2920 + }, + { + "completion_length": 1835.1667175292969, + "epoch": 0.4452743902439024, + "grad_norm": 0.17594753732403381, + "kl": 0.10595703125, + "learning_rate": 2.0362103992584397e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2921 + }, + { + "completion_length": 608.0000305175781, + "epoch": 0.4454268292682927, + "grad_norm": 0.14895117666865762, + "kl": 0.098388671875, + "learning_rate": 2.0354648949254205e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2922 + }, + { + "completion_length": 2904.3333740234375, + "epoch": 0.4455792682926829, + "grad_norm": 0.04996697971015156, + "kl": 0.0452880859375, + "learning_rate": 2.034719238978843e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2923 + }, + { + "completion_length": 1901.0, + "epoch": 0.4457317073170732, + "grad_norm": 0.16590623386812756, + "kl": 0.072509765625, + "learning_rate": 2.033973431629835e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2924 + }, + { + "completion_length": 2576.0, + "epoch": 0.4458841463414634, + "grad_norm": 0.12445482284863041, + "kl": 0.07568359375, + "learning_rate": 2.033227473089567e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2925 + }, + { + "completion_length": 2108.666748046875, + "epoch": 0.44603658536585367, + "grad_norm": 0.1466136347892259, + "kl": 0.0694580078125, + "learning_rate": 2.0324813635692526e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2926 + }, + { + "completion_length": 1422.5000305175781, + "epoch": 0.4461890243902439, + "grad_norm": 0.08324866347429273, + "kl": 0.053955078125, + "learning_rate": 2.0317351032801476e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2927 + }, + { + "completion_length": 2812.1666870117188, + "epoch": 0.44634146341463415, + "grad_norm": 0.07703692061917736, + "kl": 0.0748291015625, + "learning_rate": 2.030988692433552e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2928 + }, + { + "completion_length": 800.5000457763672, + "epoch": 0.44649390243902437, + "grad_norm": 1.564222745066988, + "kl": 0.159912109375, + "learning_rate": 2.0302421312408056e-06, + "loss": 0.0064, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2929 + }, + { + "completion_length": 2673.666748046875, + "epoch": 0.44664634146341464, + "grad_norm": 0.14096621304581133, + "kl": 0.071044921875, + "learning_rate": 2.029495419913295e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2930 + }, + { + "completion_length": 2846.0001220703125, + "epoch": 0.44679878048780486, + "grad_norm": 0.08803871621996452, + "kl": 0.07080078125, + "learning_rate": 2.028748558662445e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2931 + }, + { + "completion_length": 3311.666748046875, + "epoch": 0.44695121951219513, + "grad_norm": 0.05327743390527723, + "kl": 0.0633544921875, + "learning_rate": 2.0280015476997256e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2932 + }, + { + "completion_length": 3809.166748046875, + "epoch": 0.44710365853658535, + "grad_norm": 0.03355752071729692, + "kl": 0.046875, + "learning_rate": 2.0272543872366477e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2933 + }, + { + "completion_length": 3655.166748046875, + "epoch": 0.4472560975609756, + "grad_norm": 0.03870016361669563, + "kl": 0.052978515625, + "learning_rate": 2.026507077484766e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2934 + }, + { + "completion_length": 2809.3333740234375, + "epoch": 0.44740853658536583, + "grad_norm": 0.0768697085033969, + "kl": 0.0626220703125, + "learning_rate": 2.025759618655676e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2935 + }, + { + "completion_length": 4096.0, + "epoch": 0.4475609756097561, + "grad_norm": 0.03211758699676972, + "kl": 0.04541015625, + "learning_rate": 2.0250120109610155e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2936 + }, + { + "completion_length": 1834.8334350585938, + "epoch": 0.4477134146341463, + "grad_norm": 0.1767887584054138, + "kl": 0.0849609375, + "learning_rate": 2.024264254612466e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2937 + }, + { + "completion_length": 4096.0, + "epoch": 0.4478658536585366, + "grad_norm": 0.0383868311021118, + "kl": 0.0479736328125, + "learning_rate": 2.02351634982175e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2938 + }, + { + "completion_length": 3716.3333740234375, + "epoch": 0.4480182926829268, + "grad_norm": 0.044202283419420604, + "kl": 0.0570068359375, + "learning_rate": 2.0227682968006313e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2939 + }, + { + "completion_length": 4055.0, + "epoch": 0.4481707317073171, + "grad_norm": 0.136513504700261, + "kl": 0.0479736328125, + "learning_rate": 2.0220200957609172e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2940 + }, + { + "completion_length": 2987.3333740234375, + "epoch": 0.4483231707317073, + "grad_norm": 0.2760357441809317, + "kl": 0.088623046875, + "learning_rate": 2.0212717469144557e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2941 + }, + { + "completion_length": 1564.1666870117188, + "epoch": 0.44847560975609757, + "grad_norm": 0.1543194460044213, + "kl": 0.078369140625, + "learning_rate": 2.0205232504731376e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2942 + }, + { + "completion_length": 4096.0, + "epoch": 0.4486280487804878, + "grad_norm": 0.061688588415928614, + "kl": 0.0489501953125, + "learning_rate": 2.0197746066488947e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2943 + }, + { + "completion_length": 2524.666748046875, + "epoch": 0.44878048780487806, + "grad_norm": 0.08875416791125941, + "kl": 0.0675048828125, + "learning_rate": 2.019025815653701e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2944 + }, + { + "completion_length": 3961.8333740234375, + "epoch": 0.4489329268292683, + "grad_norm": 0.04671115221481262, + "kl": 0.0577392578125, + "learning_rate": 2.018276877699572e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2945 + }, + { + "completion_length": 2386.1666870117188, + "epoch": 0.44908536585365855, + "grad_norm": 0.0780482166780699, + "kl": 0.0711669921875, + "learning_rate": 2.0175277929985644e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2946 + }, + { + "completion_length": 876.5000305175781, + "epoch": 0.44923780487804876, + "grad_norm": 0.11417044786741698, + "kl": 0.078125, + "learning_rate": 2.016778561762777e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2947 + }, + { + "completion_length": 1700.1666870117188, + "epoch": 0.44939024390243903, + "grad_norm": 1.045238835711254, + "kl": 0.08203125, + "learning_rate": 2.016029184204351e-06, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2948 + }, + { + "completion_length": 2273.166748046875, + "epoch": 0.44954268292682925, + "grad_norm": 0.07027044639724044, + "kl": 0.0611572265625, + "learning_rate": 2.015279660535466e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2949 + }, + { + "completion_length": 1550.1666870117188, + "epoch": 0.4496951219512195, + "grad_norm": 0.11076271440907591, + "kl": 0.076416015625, + "learning_rate": 2.0145299909683476e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2950 + }, + { + "completion_length": 2179.1666717529297, + "epoch": 0.44984756097560974, + "grad_norm": 0.10421644735096552, + "kl": 0.075927734375, + "learning_rate": 2.0137801757152578e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2951 + }, + { + "completion_length": 3066.166748046875, + "epoch": 0.45, + "grad_norm": 0.10099532901427168, + "kl": 0.0697021484375, + "learning_rate": 2.0130302149885033e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2952 + }, + { + "completion_length": 2040.8333740234375, + "epoch": 0.4501524390243902, + "grad_norm": 0.15020949903154637, + "kl": 0.085205078125, + "learning_rate": 2.0122801090004302e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2953 + }, + { + "completion_length": 2407.3334350585938, + "epoch": 0.4503048780487805, + "grad_norm": 0.058923690699251664, + "kl": 0.0517578125, + "learning_rate": 2.011529857963427e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2954 + }, + { + "completion_length": 1217.0000305175781, + "epoch": 0.4504573170731707, + "grad_norm": 0.16704675460958957, + "kl": 0.093017578125, + "learning_rate": 2.010779462089922e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2955 + }, + { + "completion_length": 675.0000305175781, + "epoch": 0.450609756097561, + "grad_norm": 0.21660840227170083, + "kl": 0.07763671875, + "learning_rate": 2.0100289215923856e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2956 + }, + { + "completion_length": 3074.3333740234375, + "epoch": 0.4507621951219512, + "grad_norm": 0.042828669469528395, + "kl": 0.0579833984375, + "learning_rate": 2.0092782366833284e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2957 + }, + { + "completion_length": 1297.3333740234375, + "epoch": 0.4509146341463415, + "grad_norm": 0.1470247685118318, + "kl": 0.090576171875, + "learning_rate": 2.008527407575302e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2958 + }, + { + "completion_length": 1591.8333740234375, + "epoch": 0.4510670731707317, + "grad_norm": 0.14540645376360115, + "kl": 0.10205078125, + "learning_rate": 2.0077764344808995e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2959 + }, + { + "completion_length": 1750.1667175292969, + "epoch": 0.45121951219512196, + "grad_norm": 0.09140717228075597, + "kl": 0.0771484375, + "learning_rate": 2.007025317612754e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2960 + }, + { + "completion_length": 1999.5000610351562, + "epoch": 0.4513719512195122, + "grad_norm": 0.11588014005359543, + "kl": 0.087646484375, + "learning_rate": 2.0062740571835397e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2961 + }, + { + "completion_length": 1994.3333740234375, + "epoch": 0.45152439024390245, + "grad_norm": 0.11600863321607434, + "kl": 0.0682373046875, + "learning_rate": 2.0055226534059715e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2962 + }, + { + "completion_length": 2095.8333740234375, + "epoch": 0.45167682926829267, + "grad_norm": 0.08678288911077481, + "kl": 0.078369140625, + "learning_rate": 2.004771106492804e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2963 + }, + { + "completion_length": 1619.3333740234375, + "epoch": 0.45182926829268294, + "grad_norm": 0.12457574419769255, + "kl": 0.07666015625, + "learning_rate": 2.0040194166568337e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2964 + }, + { + "completion_length": 1530.666748046875, + "epoch": 0.45198170731707316, + "grad_norm": 0.10566472066526271, + "kl": 0.072265625, + "learning_rate": 2.003267584110896e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2965 + }, + { + "completion_length": 958.3333435058594, + "epoch": 0.4521341463414634, + "grad_norm": 0.11500206738135298, + "kl": 0.064697265625, + "learning_rate": 2.0025156090678694e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2966 + }, + { + "completion_length": 1760.666748046875, + "epoch": 0.45228658536585364, + "grad_norm": 0.12486380725272336, + "kl": 0.0908203125, + "learning_rate": 2.0017634917406683e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2967 + }, + { + "completion_length": 1443.0000610351562, + "epoch": 0.4524390243902439, + "grad_norm": 0.11983513122857102, + "kl": 0.0653076171875, + "learning_rate": 2.001011232342253e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2968 + }, + { + "completion_length": 1335.1666870117188, + "epoch": 0.45259146341463413, + "grad_norm": 1.2551554698667224, + "kl": 0.111572265625, + "learning_rate": 2.000258831085619e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2969 + }, + { + "completion_length": 3627.666748046875, + "epoch": 0.4527439024390244, + "grad_norm": 0.18015376542912787, + "kl": 0.062255859375, + "learning_rate": 1.9995062881838053e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2970 + }, + { + "completion_length": 2442.3333740234375, + "epoch": 0.4528963414634146, + "grad_norm": 0.06593706902998159, + "kl": 0.056396484375, + "learning_rate": 1.9987536038498885e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2971 + }, + { + "completion_length": 3024.5001220703125, + "epoch": 0.4530487804878049, + "grad_norm": 0.053921093293050594, + "kl": 0.0570068359375, + "learning_rate": 1.9980007782969882e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2972 + }, + { + "completion_length": 1506.3333740234375, + "epoch": 0.4532012195121951, + "grad_norm": 0.12290632518235889, + "kl": 0.073486328125, + "learning_rate": 1.99724781173826e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2973 + }, + { + "completion_length": 1927.5000610351562, + "epoch": 0.4533536585365854, + "grad_norm": 1.6954971130731356, + "kl": 0.0634765625, + "learning_rate": 1.996494704386903e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2974 + }, + { + "completion_length": 1084.1667175292969, + "epoch": 0.4535060975609756, + "grad_norm": 0.11543102190153962, + "kl": 0.08203125, + "learning_rate": 1.995741456456156e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2975 + }, + { + "completion_length": 1786.0000915527344, + "epoch": 0.45365853658536587, + "grad_norm": 0.07771616550989574, + "kl": 0.0552978515625, + "learning_rate": 1.994988068159294e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2976 + }, + { + "completion_length": 1262.3333740234375, + "epoch": 0.4538109756097561, + "grad_norm": 0.16293184496422217, + "kl": 0.108642578125, + "learning_rate": 1.994234539709636e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2977 + }, + { + "completion_length": 984.6667175292969, + "epoch": 0.45396341463414636, + "grad_norm": 0.1189643926242317, + "kl": 0.08447265625, + "learning_rate": 1.9934808713205387e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2978 + }, + { + "completion_length": 845.0000305175781, + "epoch": 0.4541158536585366, + "grad_norm": 0.17823848213708957, + "kl": 0.0594482421875, + "learning_rate": 1.9927270632053983e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2979 + }, + { + "completion_length": 820.1666717529297, + "epoch": 0.45426829268292684, + "grad_norm": 0.10913186282217474, + "kl": 0.0560302734375, + "learning_rate": 1.9919731155776504e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2980 + }, + { + "completion_length": 3046.3333740234375, + "epoch": 0.45442073170731706, + "grad_norm": 0.08425348536648519, + "kl": 0.0810546875, + "learning_rate": 1.9912190286507713e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2981 + }, + { + "completion_length": 1733.666748046875, + "epoch": 0.45457317073170733, + "grad_norm": 0.11415327808319065, + "kl": 0.099609375, + "learning_rate": 1.9904648026382756e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2982 + }, + { + "completion_length": 1657.8333740234375, + "epoch": 0.45472560975609755, + "grad_norm": 0.0933553325174549, + "kl": 0.091796875, + "learning_rate": 1.9897104377537185e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2983 + }, + { + "completion_length": 2175.0, + "epoch": 0.4548780487804878, + "grad_norm": 0.09806868610468071, + "kl": 0.078369140625, + "learning_rate": 1.9889559342106926e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2984 + }, + { + "completion_length": 671.5000305175781, + "epoch": 0.45503048780487804, + "grad_norm": 2.1731376416417114, + "kl": 0.08984375, + "learning_rate": 1.988201292222832e-06, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2985 + }, + { + "completion_length": 900.5, + "epoch": 0.4551829268292683, + "grad_norm": 0.2133812182595002, + "kl": 0.08837890625, + "learning_rate": 1.987446512003808e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2986 + }, + { + "completion_length": 1328.8333740234375, + "epoch": 0.4553353658536585, + "grad_norm": 0.11613420760814994, + "kl": 0.090576171875, + "learning_rate": 1.9866915937673317e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2987 + }, + { + "completion_length": 1130.6666870117188, + "epoch": 0.4554878048780488, + "grad_norm": 0.1329856516189875, + "kl": 0.0810546875, + "learning_rate": 1.985936537727155e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2988 + }, + { + "completion_length": 1498.0000305175781, + "epoch": 0.455640243902439, + "grad_norm": 0.3553985800038229, + "kl": 0.0908203125, + "learning_rate": 1.985181344097066e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 2989 + }, + { + "completion_length": 973.5, + "epoch": 0.4557926829268293, + "grad_norm": 0.1886227171294477, + "kl": 0.0791015625, + "learning_rate": 1.984426013090894e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2990 + }, + { + "completion_length": 1075.1667175292969, + "epoch": 0.4559451219512195, + "grad_norm": 0.12500921597054065, + "kl": 0.0908203125, + "learning_rate": 1.983670544922505e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2991 + }, + { + "completion_length": 1014.0000305175781, + "epoch": 0.4560975609756098, + "grad_norm": 0.10228391101488789, + "kl": 0.06982421875, + "learning_rate": 1.9829149398058068e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2992 + }, + { + "completion_length": 1744.3333740234375, + "epoch": 0.45625, + "grad_norm": 0.10644787221914441, + "kl": 0.07080078125, + "learning_rate": 1.9821591979547425e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2993 + }, + { + "completion_length": 1727.3333740234375, + "epoch": 0.45640243902439026, + "grad_norm": 1.4463397621931946, + "kl": 0.089599609375, + "learning_rate": 1.981403319583297e-06, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2994 + }, + { + "completion_length": 1101.0000305175781, + "epoch": 0.4565548780487805, + "grad_norm": 1.6784365669123062, + "kl": 0.13427734375, + "learning_rate": 1.980647304905492e-06, + "loss": 0.0054, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 2995 + }, + { + "completion_length": 1056.666748046875, + "epoch": 0.45670731707317075, + "grad_norm": 0.10549345480218655, + "kl": 0.0753173828125, + "learning_rate": 1.9798911541353882e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2996 + }, + { + "completion_length": 701.0, + "epoch": 0.45685975609756097, + "grad_norm": 0.2609404888508607, + "kl": 0.08349609375, + "learning_rate": 1.979134867487086e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2997 + }, + { + "completion_length": 2188.3334350585938, + "epoch": 0.45701219512195124, + "grad_norm": 0.07405473306478796, + "kl": 0.074951171875, + "learning_rate": 1.978378445174722e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2998 + }, + { + "completion_length": 2102.3333740234375, + "epoch": 0.45716463414634145, + "grad_norm": 0.10606525991190416, + "kl": 0.091064453125, + "learning_rate": 1.977621887412474e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 2999 + }, + { + "completion_length": 919.6667175292969, + "epoch": 0.4573170731707317, + "grad_norm": 0.17638021910872412, + "kl": 0.091796875, + "learning_rate": 1.976865194414555e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3000 + }, + { + "completion_length": 892.1666870117188, + "epoch": 0.45746951219512194, + "grad_norm": 0.10106907764541477, + "kl": 0.062744140625, + "learning_rate": 1.976108366395219e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3001 + }, + { + "completion_length": 1353.0000305175781, + "epoch": 0.4576219512195122, + "grad_norm": 0.09921756721419099, + "kl": 0.08447265625, + "learning_rate": 1.975351403568756e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3002 + }, + { + "completion_length": 1483.6667175292969, + "epoch": 0.45777439024390243, + "grad_norm": 0.12155714702096931, + "kl": 0.086181640625, + "learning_rate": 1.9745943061494967e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3003 + }, + { + "completion_length": 1914.6666870117188, + "epoch": 0.4579268292682927, + "grad_norm": 0.14430845983907478, + "kl": 0.0908203125, + "learning_rate": 1.9738370743518076e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3004 + }, + { + "completion_length": 1450.1666870117188, + "epoch": 0.4580792682926829, + "grad_norm": 1.5506232397290054, + "kl": 0.08203125, + "learning_rate": 1.9730797083900945e-06, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3005 + }, + { + "completion_length": 997.6666870117188, + "epoch": 0.4582317073170732, + "grad_norm": 0.1445580603288004, + "kl": 0.076416015625, + "learning_rate": 1.9723222084788013e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3006 + }, + { + "completion_length": 3218.166748046875, + "epoch": 0.4583841463414634, + "grad_norm": 0.06591774134213667, + "kl": 0.06787109375, + "learning_rate": 1.971564574832409e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3007 + }, + { + "completion_length": 1008.0000305175781, + "epoch": 0.4585365853658537, + "grad_norm": 0.18371380250846303, + "kl": 0.076416015625, + "learning_rate": 1.9708068076654364e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3008 + }, + { + "completion_length": 2339.8333435058594, + "epoch": 0.4586890243902439, + "grad_norm": 0.5993952098992528, + "kl": 0.1007080078125, + "learning_rate": 1.9700489071924406e-06, + "loss": 0.004, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3009 + }, + { + "completion_length": 1000.0000305175781, + "epoch": 0.45884146341463417, + "grad_norm": 0.14777099741756003, + "kl": 0.079345703125, + "learning_rate": 1.969290873628018e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3010 + }, + { + "completion_length": 1209.8333740234375, + "epoch": 0.4589939024390244, + "grad_norm": 0.4908787755023371, + "kl": 0.0908203125, + "learning_rate": 1.968532707186799e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3011 + }, + { + "completion_length": 2705.166748046875, + "epoch": 0.45914634146341465, + "grad_norm": 0.08017374634901318, + "kl": 0.08251953125, + "learning_rate": 1.9677744080834547e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3012 + }, + { + "completion_length": 2969.3333740234375, + "epoch": 0.45929878048780487, + "grad_norm": 0.05838027090068112, + "kl": 0.065673828125, + "learning_rate": 1.9670159765326926e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3013 + }, + { + "completion_length": 1944.1666870117188, + "epoch": 0.45945121951219514, + "grad_norm": 0.19589124644723482, + "kl": 0.0784912109375, + "learning_rate": 1.966257412749258e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3014 + }, + { + "completion_length": 2302.3334350585938, + "epoch": 0.45960365853658536, + "grad_norm": 0.0768996267624877, + "kl": 0.071044921875, + "learning_rate": 1.9654987169479337e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3015 + }, + { + "completion_length": 3017.166748046875, + "epoch": 0.45975609756097563, + "grad_norm": 1.0397521459415966, + "kl": 0.098388671875, + "learning_rate": 1.9647398893435394e-06, + "loss": 0.0039, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3016 + }, + { + "completion_length": 1208.6666870117188, + "epoch": 0.45990853658536585, + "grad_norm": 0.17842160547588173, + "kl": 0.054443359375, + "learning_rate": 1.963980930150933e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3017 + }, + { + "completion_length": 1715.1666870117188, + "epoch": 0.4600609756097561, + "grad_norm": 1.0142089910246348, + "kl": 0.078369140625, + "learning_rate": 1.963221839585008e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3018 + }, + { + "completion_length": 3897.166748046875, + "epoch": 0.46021341463414633, + "grad_norm": 0.03399935344807216, + "kl": 0.0411376953125, + "learning_rate": 1.962462617860697e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3019 + }, + { + "completion_length": 2174.6666870117188, + "epoch": 0.4603658536585366, + "grad_norm": 0.08780215788693584, + "kl": 0.0701904296875, + "learning_rate": 1.9617032651929686e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3020 + }, + { + "completion_length": 3949.5, + "epoch": 0.4605182926829268, + "grad_norm": 0.0334631551940992, + "kl": 0.0408935546875, + "learning_rate": 1.960943781796829e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3021 + }, + { + "completion_length": 3995.666748046875, + "epoch": 0.4606707317073171, + "grad_norm": 0.03717687160740519, + "kl": 0.0460205078125, + "learning_rate": 1.960184167887321e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3022 + }, + { + "completion_length": 3679.666748046875, + "epoch": 0.4608231707317073, + "grad_norm": 0.03485126447303973, + "kl": 0.04052734375, + "learning_rate": 1.9594244236795246e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3023 + }, + { + "completion_length": 2311.166748046875, + "epoch": 0.4609756097560976, + "grad_norm": 0.16302737998046857, + "kl": 0.076171875, + "learning_rate": 1.9586645493885565e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3024 + }, + { + "completion_length": 3185.3333740234375, + "epoch": 0.4611280487804878, + "grad_norm": 0.06506796039618412, + "kl": 0.0543212890625, + "learning_rate": 1.957904545229571e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3025 + }, + { + "completion_length": 2214.5, + "epoch": 0.46128048780487807, + "grad_norm": 0.08752588387307698, + "kl": 0.072509765625, + "learning_rate": 1.957144411417758e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3026 + }, + { + "completion_length": 2970.8333740234375, + "epoch": 0.4614329268292683, + "grad_norm": 0.06615379760056399, + "kl": 0.042724609375, + "learning_rate": 1.9563841481683445e-06, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3027 + }, + { + "completion_length": 2727.8333740234375, + "epoch": 0.46158536585365856, + "grad_norm": 0.052547190575582894, + "kl": 0.0469970703125, + "learning_rate": 1.9556237556965955e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3028 + }, + { + "completion_length": 3521.8333740234375, + "epoch": 0.4617378048780488, + "grad_norm": 0.04362602667297557, + "kl": 0.0413818359375, + "learning_rate": 1.9548632342178094e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3029 + }, + { + "completion_length": 3981.5, + "epoch": 0.46189024390243905, + "grad_norm": 0.7047942019037997, + "kl": 0.0400390625, + "learning_rate": 1.954102583947325e-06, + "loss": 0.0016, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3030 + }, + { + "completion_length": 2105.8333435058594, + "epoch": 0.46204268292682926, + "grad_norm": 0.12191996079499079, + "kl": 0.085205078125, + "learning_rate": 1.9533418051005148e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3031 + }, + { + "completion_length": 3840.5001220703125, + "epoch": 0.46219512195121953, + "grad_norm": 0.043099064264654124, + "kl": 0.04443359375, + "learning_rate": 1.9525808978927886e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3032 + }, + { + "completion_length": 3198.666748046875, + "epoch": 0.46234756097560975, + "grad_norm": 0.05179264792803703, + "kl": 0.0513916015625, + "learning_rate": 1.9518198625395925e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3033 + }, + { + "completion_length": 2042.5, + "epoch": 0.4625, + "grad_norm": 0.8044672150885017, + "kl": 0.0916748046875, + "learning_rate": 1.9510586992564096e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3034 + }, + { + "completion_length": 2549.3333740234375, + "epoch": 0.46265243902439024, + "grad_norm": 0.06742096926674748, + "kl": 0.0594482421875, + "learning_rate": 1.950297408258758e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3035 + }, + { + "completion_length": 2895.666748046875, + "epoch": 0.4628048780487805, + "grad_norm": 0.05922794968494331, + "kl": 0.046875, + "learning_rate": 1.9495359897621926e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3036 + }, + { + "completion_length": 1613.6666870117188, + "epoch": 0.4629573170731707, + "grad_norm": 0.11677504928833743, + "kl": 0.10498046875, + "learning_rate": 1.9487744439823046e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3037 + }, + { + "completion_length": 1281.5, + "epoch": 0.463109756097561, + "grad_norm": 0.17049081076809783, + "kl": 0.113525390625, + "learning_rate": 1.9480127711347203e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3038 + }, + { + "completion_length": 1956.5000610351562, + "epoch": 0.4632621951219512, + "grad_norm": 0.116055015884259, + "kl": 0.0548095703125, + "learning_rate": 1.9472509714351036e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3039 + }, + { + "completion_length": 2345.1666870117188, + "epoch": 0.4634146341463415, + "grad_norm": 0.13357839090129334, + "kl": 0.072021484375, + "learning_rate": 1.946489045099152e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3040 + }, + { + "completion_length": 1648.166748046875, + "epoch": 0.4635670731707317, + "grad_norm": 0.09703987468343307, + "kl": 0.0927734375, + "learning_rate": 1.9457269923426015e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3041 + }, + { + "completion_length": 1558.0000610351562, + "epoch": 0.463719512195122, + "grad_norm": 0.11737150158335069, + "kl": 0.1025390625, + "learning_rate": 1.9449648133812217e-06, + "loss": 0.0041, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3042 + }, + { + "completion_length": 1944.6666870117188, + "epoch": 0.4638719512195122, + "grad_norm": 3.290839360839676, + "kl": 0.123046875, + "learning_rate": 1.9442025084308193e-06, + "loss": 0.0049, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3043 + }, + { + "completion_length": 1856.8333740234375, + "epoch": 0.46402439024390246, + "grad_norm": 0.07873246539113239, + "kl": 0.0751953125, + "learning_rate": 1.9434400777072364e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3044 + }, + { + "completion_length": 1991.1667175292969, + "epoch": 0.4641768292682927, + "grad_norm": 0.10863153030959193, + "kl": 0.08154296875, + "learning_rate": 1.94267752142635e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3045 + }, + { + "completion_length": 1705.3333740234375, + "epoch": 0.46432926829268295, + "grad_norm": 1.1219697108294042, + "kl": 0.0809326171875, + "learning_rate": 1.9419148398040737e-06, + "loss": 0.0032, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 3046 + }, + { + "completion_length": 2061.0, + "epoch": 0.46448170731707317, + "grad_norm": 0.07606890255397378, + "kl": 0.0672607421875, + "learning_rate": 1.9411520330563558e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3047 + }, + { + "completion_length": 1840.3333740234375, + "epoch": 0.46463414634146344, + "grad_norm": 0.18066209067992045, + "kl": 0.0771484375, + "learning_rate": 1.94038910139918e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3048 + }, + { + "completion_length": 1286.6666870117188, + "epoch": 0.46478658536585366, + "grad_norm": 0.11486544002669236, + "kl": 0.080078125, + "learning_rate": 1.9396260450485663e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3049 + }, + { + "completion_length": 1264.0, + "epoch": 0.4649390243902439, + "grad_norm": 0.08833847237574251, + "kl": 0.07666015625, + "learning_rate": 1.938862864220569e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3050 + }, + { + "completion_length": 1698.3333740234375, + "epoch": 0.46509146341463414, + "grad_norm": 0.1048510670415454, + "kl": 0.053955078125, + "learning_rate": 1.9380995591312777e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3051 + }, + { + "completion_length": 2407.5001220703125, + "epoch": 0.4652439024390244, + "grad_norm": 0.07484086393212429, + "kl": 0.076171875, + "learning_rate": 1.9373361299968173e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3052 + }, + { + "completion_length": 1665.5000610351562, + "epoch": 0.46539634146341463, + "grad_norm": 0.13173742656537463, + "kl": 0.100341796875, + "learning_rate": 1.93657257703335e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3053 + }, + { + "completion_length": 1161.0000305175781, + "epoch": 0.4655487804878049, + "grad_norm": 0.1262740465149748, + "kl": 0.0635986328125, + "learning_rate": 1.9358089004570686e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3054 + }, + { + "completion_length": 1622.666748046875, + "epoch": 0.4657012195121951, + "grad_norm": 0.09491285033691163, + "kl": 0.07421875, + "learning_rate": 1.9350451004842045e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3055 + }, + { + "completion_length": 2644.166748046875, + "epoch": 0.4658536585365854, + "grad_norm": 0.08744430843095867, + "kl": 0.070068359375, + "learning_rate": 1.934281177331023e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3056 + }, + { + "completion_length": 1805.8333740234375, + "epoch": 0.4660060975609756, + "grad_norm": 0.12030136298744312, + "kl": 0.06884765625, + "learning_rate": 1.933517131213824e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3057 + }, + { + "completion_length": 1956.8333740234375, + "epoch": 0.4661585365853659, + "grad_norm": 1.3306775307284346, + "kl": 0.094970703125, + "learning_rate": 1.9327529623489426e-06, + "loss": 0.0038, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3058 + }, + { + "completion_length": 908.1666870117188, + "epoch": 0.4663109756097561, + "grad_norm": 0.15693124858999366, + "kl": 0.115234375, + "learning_rate": 1.931988670952748e-06, + "loss": 0.0046, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3059 + }, + { + "completion_length": 1582.1666870117188, + "epoch": 0.46646341463414637, + "grad_norm": 0.1276145753143193, + "kl": 0.06689453125, + "learning_rate": 1.9312242572416446e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3060 + }, + { + "completion_length": 1753.5, + "epoch": 0.4666158536585366, + "grad_norm": 1.7905132346487636, + "kl": 0.091064453125, + "learning_rate": 1.930459721432072e-06, + "loss": 0.0036, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3061 + }, + { + "completion_length": 1852.666748046875, + "epoch": 0.46676829268292686, + "grad_norm": 0.08706295690235806, + "kl": 0.0791015625, + "learning_rate": 1.9296950637405036e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3062 + }, + { + "completion_length": 806.5000305175781, + "epoch": 0.46692073170731707, + "grad_norm": 0.20328277955003424, + "kl": 0.088623046875, + "learning_rate": 1.9289302843834468e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3063 + }, + { + "completion_length": 3191.5001220703125, + "epoch": 0.46707317073170734, + "grad_norm": 0.051723995236550094, + "kl": 0.0533447265625, + "learning_rate": 1.928165383577445e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3064 + }, + { + "completion_length": 1080.0, + "epoch": 0.46722560975609756, + "grad_norm": 0.3813542704281078, + "kl": 0.097900390625, + "learning_rate": 1.927400361539074e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3065 + }, + { + "completion_length": 2241.8333740234375, + "epoch": 0.46737804878048783, + "grad_norm": 0.11920809186096605, + "kl": 0.072265625, + "learning_rate": 1.926635218484947e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3066 + }, + { + "completion_length": 2085.5001220703125, + "epoch": 0.46753048780487805, + "grad_norm": 0.10029379911455877, + "kl": 0.084716796875, + "learning_rate": 1.9258699546317076e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3067 + }, + { + "completion_length": 2484.8334350585938, + "epoch": 0.4676829268292683, + "grad_norm": 0.07674053288712941, + "kl": 0.0771484375, + "learning_rate": 1.925104570196036e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3068 + }, + { + "completion_length": 1528.5, + "epoch": 0.46783536585365854, + "grad_norm": 0.5043023433087399, + "kl": 0.14208984375, + "learning_rate": 1.9243390653946463e-06, + "loss": 0.0057, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3069 + }, + { + "completion_length": 2527.166748046875, + "epoch": 0.4679878048780488, + "grad_norm": 0.08023617064152389, + "kl": 0.072021484375, + "learning_rate": 1.923573440444286e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3070 + }, + { + "completion_length": 2288.3333740234375, + "epoch": 0.468140243902439, + "grad_norm": 0.1077686299153531, + "kl": 0.08984375, + "learning_rate": 1.922807695561738e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3071 + }, + { + "completion_length": 1053.1666870117188, + "epoch": 0.4682926829268293, + "grad_norm": 0.10874776399362487, + "kl": 0.0745849609375, + "learning_rate": 1.9220418309638175e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3072 + }, + { + "completion_length": 1730.0000610351562, + "epoch": 0.4684451219512195, + "grad_norm": 0.09784414776395754, + "kl": 0.088134765625, + "learning_rate": 1.9212758468673744e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3073 + }, + { + "completion_length": 1546.3333740234375, + "epoch": 0.4685975609756098, + "grad_norm": 0.19088272123478012, + "kl": 0.111083984375, + "learning_rate": 1.920509743489292e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3074 + }, + { + "completion_length": 543.5000305175781, + "epoch": 0.46875, + "grad_norm": 1.1375327042003693, + "kl": 0.130126953125, + "learning_rate": 1.9197435210464884e-06, + "loss": 0.0052, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3075 + }, + { + "completion_length": 943.8333740234375, + "epoch": 0.4689024390243902, + "grad_norm": 0.23594610032543584, + "kl": 0.118408203125, + "learning_rate": 1.9189771797559143e-06, + "loss": 0.0047, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3076 + }, + { + "completion_length": 1046.6666870117188, + "epoch": 0.4690548780487805, + "grad_norm": 0.3355486389428464, + "kl": 0.099609375, + "learning_rate": 1.918210719834554e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3077 + }, + { + "completion_length": 1235.5000305175781, + "epoch": 0.4692073170731707, + "grad_norm": 0.22641009300943737, + "kl": 0.10693359375, + "learning_rate": 1.917444141499427e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3078 + }, + { + "completion_length": 1099.1666870117188, + "epoch": 0.469359756097561, + "grad_norm": 0.09465990366303799, + "kl": 0.088623046875, + "learning_rate": 1.9166774449675845e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3079 + }, + { + "completion_length": 1699.5000915527344, + "epoch": 0.4695121951219512, + "grad_norm": 0.11438664477274267, + "kl": 0.085693359375, + "learning_rate": 1.915910630456112e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3080 + }, + { + "completion_length": 326.00001525878906, + "epoch": 0.46966463414634146, + "grad_norm": 0.25158980514839147, + "kl": 0.1357421875, + "learning_rate": 1.915143698182128e-06, + "loss": 0.0054, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3081 + }, + { + "completion_length": 1097.5000305175781, + "epoch": 0.4698170731707317, + "grad_norm": 0.15605349574497068, + "kl": 0.0966796875, + "learning_rate": 1.9143766483627853e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3082 + }, + { + "completion_length": 1752.3333435058594, + "epoch": 0.46996951219512195, + "grad_norm": 0.12544527914517728, + "kl": 0.081787109375, + "learning_rate": 1.9136094812152685e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3083 + }, + { + "completion_length": 1424.0000610351562, + "epoch": 0.47012195121951217, + "grad_norm": 2.3670652588941783, + "kl": 0.120361328125, + "learning_rate": 1.9128421969567964e-06, + "loss": 0.0048, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3084 + }, + { + "completion_length": 2211.5, + "epoch": 0.47027439024390244, + "grad_norm": 0.10648372498837996, + "kl": 0.109130859375, + "learning_rate": 1.912074795804621e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3085 + }, + { + "completion_length": 672.1666870117188, + "epoch": 0.47042682926829266, + "grad_norm": 0.5909956309111059, + "kl": 0.115234375, + "learning_rate": 1.911307277976027e-06, + "loss": 0.0046, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3086 + }, + { + "completion_length": 1534.5, + "epoch": 0.47057926829268293, + "grad_norm": 0.4667320905475585, + "kl": 0.08740234375, + "learning_rate": 1.910539643688333e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3087 + }, + { + "completion_length": 969.6666870117188, + "epoch": 0.47073170731707314, + "grad_norm": 1.7591407165316855, + "kl": 0.090087890625, + "learning_rate": 1.909771893158889e-06, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3088 + }, + { + "completion_length": 1736.1667175292969, + "epoch": 0.4708841463414634, + "grad_norm": 0.172958545933711, + "kl": 0.105224609375, + "learning_rate": 1.9090040266050787e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3089 + }, + { + "completion_length": 1749.0, + "epoch": 0.47103658536585363, + "grad_norm": 0.09434791819602596, + "kl": 0.075927734375, + "learning_rate": 1.908236044244319e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3090 + }, + { + "completion_length": 1075.6667175292969, + "epoch": 0.4711890243902439, + "grad_norm": 0.11475944384630388, + "kl": 0.102294921875, + "learning_rate": 1.90746794629406e-06, + "loss": 0.0041, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3091 + }, + { + "completion_length": 1548.166748046875, + "epoch": 0.4713414634146341, + "grad_norm": 0.13380431770515316, + "kl": 0.08837890625, + "learning_rate": 1.9066997329717833e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3092 + }, + { + "completion_length": 1149.3333740234375, + "epoch": 0.4714939024390244, + "grad_norm": 1.1212517370013952, + "kl": 0.1357421875, + "learning_rate": 1.9059314044950038e-06, + "loss": 0.0054, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3093 + }, + { + "completion_length": 1945.3334350585938, + "epoch": 0.4716463414634146, + "grad_norm": 0.08988980799303926, + "kl": 0.077880859375, + "learning_rate": 1.9051629610812684e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3094 + }, + { + "completion_length": 1602.3333435058594, + "epoch": 0.4717987804878049, + "grad_norm": 0.11031930583710695, + "kl": 0.07568359375, + "learning_rate": 1.9043944029481578e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3095 + }, + { + "completion_length": 2650.0, + "epoch": 0.4719512195121951, + "grad_norm": 0.14122012505626877, + "kl": 0.083740234375, + "learning_rate": 1.9036257303132843e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3096 + }, + { + "completion_length": 1561.5000610351562, + "epoch": 0.47210365853658537, + "grad_norm": 0.10683283893160972, + "kl": 0.10107421875, + "learning_rate": 1.9028569433942922e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3097 + }, + { + "completion_length": 1632.666748046875, + "epoch": 0.4722560975609756, + "grad_norm": 0.09814086581244283, + "kl": 0.0740966796875, + "learning_rate": 1.9020880424088594e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3098 + }, + { + "completion_length": 906.8333435058594, + "epoch": 0.47240853658536586, + "grad_norm": 0.23257858013546626, + "kl": 0.171630859375, + "learning_rate": 1.9013190275746953e-06, + "loss": 0.0069, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3099 + }, + { + "completion_length": 2272.8334350585938, + "epoch": 0.4725609756097561, + "grad_norm": 0.11889401502070444, + "kl": 0.09033203125, + "learning_rate": 1.9005498991095422e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3100 + }, + { + "completion_length": 1480.1667175292969, + "epoch": 0.47271341463414634, + "grad_norm": 0.08946518253526027, + "kl": 0.0604248046875, + "learning_rate": 1.8997806572311727e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3101 + }, + { + "completion_length": 2000.5000610351562, + "epoch": 0.47286585365853656, + "grad_norm": 0.08552758161224995, + "kl": 0.0751953125, + "learning_rate": 1.8990113021573945e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3102 + }, + { + "completion_length": 1935.166748046875, + "epoch": 0.47301829268292683, + "grad_norm": 0.08784068012917173, + "kl": 0.087646484375, + "learning_rate": 1.898241834106044e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3103 + }, + { + "completion_length": 1893.5000610351562, + "epoch": 0.47317073170731705, + "grad_norm": 0.0860968955003907, + "kl": 0.08154296875, + "learning_rate": 1.8974722532949929e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3104 + }, + { + "completion_length": 1291.3333740234375, + "epoch": 0.4733231707317073, + "grad_norm": 0.10186768891548849, + "kl": 0.094970703125, + "learning_rate": 1.8967025599421419e-06, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3105 + }, + { + "completion_length": 970.8333740234375, + "epoch": 0.47347560975609754, + "grad_norm": 0.11353640671837505, + "kl": 0.099609375, + "learning_rate": 1.8959327542654258e-06, + "loss": 0.004, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3106 + }, + { + "completion_length": 1243.5000305175781, + "epoch": 0.4736280487804878, + "grad_norm": 0.12800950016522966, + "kl": 0.11962890625, + "learning_rate": 1.8951628364828096e-06, + "loss": 0.0048, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3107 + }, + { + "completion_length": 1426.3333435058594, + "epoch": 0.473780487804878, + "grad_norm": 0.14890877764067637, + "kl": 0.142333984375, + "learning_rate": 1.894392806812291e-06, + "loss": 0.0057, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3108 + }, + { + "completion_length": 1422.3333740234375, + "epoch": 0.4739329268292683, + "grad_norm": 0.12310470393559746, + "kl": 0.0947265625, + "learning_rate": 1.8936226654719e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3109 + }, + { + "completion_length": 1644.8333740234375, + "epoch": 0.4740853658536585, + "grad_norm": 0.10972248561153616, + "kl": 0.12158203125, + "learning_rate": 1.8928524126796962e-06, + "loss": 0.0049, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3110 + }, + { + "completion_length": 1966.8333740234375, + "epoch": 0.4742378048780488, + "grad_norm": 0.0827363019508248, + "kl": 0.08447265625, + "learning_rate": 1.8920820486537724e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3111 + }, + { + "completion_length": 1510.0, + "epoch": 0.474390243902439, + "grad_norm": 0.109766241853339, + "kl": 0.08935546875, + "learning_rate": 1.8913115736122519e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3112 + }, + { + "completion_length": 1682.0000610351562, + "epoch": 0.4745426829268293, + "grad_norm": 0.1367232418785774, + "kl": 0.088623046875, + "learning_rate": 1.8905409877732903e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3113 + }, + { + "completion_length": 774.1666870117188, + "epoch": 0.4746951219512195, + "grad_norm": 0.1490758290013717, + "kl": 0.105712890625, + "learning_rate": 1.8897702913550743e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3114 + }, + { + "completion_length": 1272.6666870117188, + "epoch": 0.47484756097560976, + "grad_norm": 0.07877690173830396, + "kl": 0.083984375, + "learning_rate": 1.8889994845758215e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3115 + }, + { + "completion_length": 2192.5001220703125, + "epoch": 0.475, + "grad_norm": 0.0817220936386667, + "kl": 0.0655517578125, + "learning_rate": 1.888228567653781e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3116 + }, + { + "completion_length": 3176.166748046875, + "epoch": 0.47515243902439025, + "grad_norm": 0.07256438251195935, + "kl": 0.0653076171875, + "learning_rate": 1.8874575408072337e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3117 + }, + { + "completion_length": 1387.666748046875, + "epoch": 0.47530487804878047, + "grad_norm": 0.18091667187542956, + "kl": 0.1318359375, + "learning_rate": 1.8866864042544907e-06, + "loss": 0.0053, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3118 + }, + { + "completion_length": 1301.3333740234375, + "epoch": 0.47545731707317074, + "grad_norm": 0.11743725159072942, + "kl": 0.069091796875, + "learning_rate": 1.8859151582138946e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3119 + }, + { + "completion_length": 2239.166748046875, + "epoch": 0.47560975609756095, + "grad_norm": 0.09071033694263168, + "kl": 0.078125, + "learning_rate": 1.8851438029038191e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3120 + }, + { + "completion_length": 1683.6666870117188, + "epoch": 0.4757621951219512, + "grad_norm": 0.13160916313087495, + "kl": 0.076416015625, + "learning_rate": 1.8843723385426677e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3121 + }, + { + "completion_length": 3161.666748046875, + "epoch": 0.47591463414634144, + "grad_norm": 0.0633137700730795, + "kl": 0.06689453125, + "learning_rate": 1.883600765348877e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3122 + }, + { + "completion_length": 2241.166748046875, + "epoch": 0.4760670731707317, + "grad_norm": 0.10880098725021436, + "kl": 0.0635986328125, + "learning_rate": 1.8828290835409124e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3123 + }, + { + "completion_length": 2115.666748046875, + "epoch": 0.47621951219512193, + "grad_norm": 0.11888825928575428, + "kl": 0.073974609375, + "learning_rate": 1.882057293337271e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3124 + }, + { + "completion_length": 1261.0000305175781, + "epoch": 0.4763719512195122, + "grad_norm": 0.14022982495716446, + "kl": 0.09130859375, + "learning_rate": 1.8812853949564805e-06, + "loss": 0.0037, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3125 + }, + { + "completion_length": 3399.5, + "epoch": 0.4765243902439024, + "grad_norm": 0.054218052520344506, + "kl": 0.0576171875, + "learning_rate": 1.8805133886170994e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3126 + }, + { + "completion_length": 1690.6667175292969, + "epoch": 0.4766768292682927, + "grad_norm": 0.10672692542250187, + "kl": 0.0679931640625, + "learning_rate": 1.8797412745377158e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3127 + }, + { + "completion_length": 1606.6667175292969, + "epoch": 0.4768292682926829, + "grad_norm": 0.1011318769374574, + "kl": 0.07470703125, + "learning_rate": 1.8789690529369492e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3128 + }, + { + "completion_length": 962.5000305175781, + "epoch": 0.4769817073170732, + "grad_norm": 0.17070831862730274, + "kl": 0.068603515625, + "learning_rate": 1.87819672403345e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3129 + }, + { + "completion_length": 2185.3334350585938, + "epoch": 0.4771341463414634, + "grad_norm": 0.12901620245868992, + "kl": 0.07275390625, + "learning_rate": 1.8774242880458974e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3130 + }, + { + "completion_length": 966.0000610351562, + "epoch": 0.47728658536585367, + "grad_norm": 0.11097031882861152, + "kl": 0.075927734375, + "learning_rate": 1.8766517451930027e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3131 + }, + { + "completion_length": 2579.1666870117188, + "epoch": 0.4774390243902439, + "grad_norm": 0.1260245965969011, + "kl": 0.069091796875, + "learning_rate": 1.8758790956935059e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3132 + }, + { + "completion_length": 2172.5000610351562, + "epoch": 0.47759146341463415, + "grad_norm": 0.07309588390736207, + "kl": 0.0584716796875, + "learning_rate": 1.875106339766178e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3133 + }, + { + "completion_length": 1227.1666870117188, + "epoch": 0.47774390243902437, + "grad_norm": 0.09043747606677502, + "kl": 0.074951171875, + "learning_rate": 1.8743334776298204e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3134 + }, + { + "completion_length": 1196.5000610351562, + "epoch": 0.47789634146341464, + "grad_norm": 0.09276889302912947, + "kl": 0.0740966796875, + "learning_rate": 1.8735605095032646e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3135 + }, + { + "completion_length": 2010.5, + "epoch": 0.47804878048780486, + "grad_norm": 0.09870937246316304, + "kl": 0.08154296875, + "learning_rate": 1.8727874356053706e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3136 + }, + { + "completion_length": 2323.166748046875, + "epoch": 0.47820121951219513, + "grad_norm": 0.5114762508512392, + "kl": 0.07666015625, + "learning_rate": 1.8720142561550302e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3137 + }, + { + "completion_length": 788.6666870117188, + "epoch": 0.47835365853658535, + "grad_norm": 0.20840963131907703, + "kl": 0.084716796875, + "learning_rate": 1.8712409713711646e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3138 + }, + { + "completion_length": 853.3333435058594, + "epoch": 0.4785060975609756, + "grad_norm": 0.08827654699026978, + "kl": 0.059326171875, + "learning_rate": 1.870467581472724e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3139 + }, + { + "completion_length": 888.8333740234375, + "epoch": 0.47865853658536583, + "grad_norm": 1.510525210360236, + "kl": 0.09423828125, + "learning_rate": 1.869694086678689e-06, + "loss": 0.0038, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3140 + }, + { + "completion_length": 740.8333435058594, + "epoch": 0.4788109756097561, + "grad_norm": 0.11461659256006118, + "kl": 0.08984375, + "learning_rate": 1.8689204872080702e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3141 + }, + { + "completion_length": 2500.0, + "epoch": 0.4789634146341463, + "grad_norm": 0.05537710527598891, + "kl": 0.07177734375, + "learning_rate": 1.8681467832799073e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3142 + }, + { + "completion_length": 1052.3333740234375, + "epoch": 0.4791158536585366, + "grad_norm": 0.1576204477150155, + "kl": 0.08203125, + "learning_rate": 1.867372975113269e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3143 + }, + { + "completion_length": 2691.666748046875, + "epoch": 0.4792682926829268, + "grad_norm": 0.14145422792688436, + "kl": 0.0557861328125, + "learning_rate": 1.8665990629272555e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3144 + }, + { + "completion_length": 1510.8333435058594, + "epoch": 0.4794207317073171, + "grad_norm": 0.08032177819087821, + "kl": 0.080078125, + "learning_rate": 1.865825046940995e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3145 + }, + { + "completion_length": 2332.5, + "epoch": 0.4795731707317073, + "grad_norm": 0.08145620689061643, + "kl": 0.06298828125, + "learning_rate": 1.8650509273736448e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3146 + }, + { + "completion_length": 1355.6666870117188, + "epoch": 0.47972560975609757, + "grad_norm": 0.11568398240185924, + "kl": 0.069091796875, + "learning_rate": 1.8642767044443923e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3147 + }, + { + "completion_length": 1087.6666870117188, + "epoch": 0.4798780487804878, + "grad_norm": 0.12112838959263111, + "kl": 0.098388671875, + "learning_rate": 1.863502378372454e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3148 + }, + { + "completion_length": 1161.1666870117188, + "epoch": 0.48003048780487806, + "grad_norm": 0.12072912936526364, + "kl": 0.0743408203125, + "learning_rate": 1.8627279493770758e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3149 + }, + { + "completion_length": 794.3333740234375, + "epoch": 0.4801829268292683, + "grad_norm": 0.11272237178965339, + "kl": 0.07763671875, + "learning_rate": 1.8619534176775315e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3150 + }, + { + "completion_length": 827.8333435058594, + "epoch": 0.48033536585365855, + "grad_norm": 0.10209631711982868, + "kl": 0.0703125, + "learning_rate": 1.861178783493126e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3151 + }, + { + "completion_length": 1286.666748046875, + "epoch": 0.48048780487804876, + "grad_norm": 0.15401988358026908, + "kl": 0.07861328125, + "learning_rate": 1.8604040470431908e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3152 + }, + { + "completion_length": 1681.6667175292969, + "epoch": 0.48064024390243903, + "grad_norm": 0.10719839412404211, + "kl": 0.0706787109375, + "learning_rate": 1.8596292085470897e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3153 + }, + { + "completion_length": 1227.0, + "epoch": 0.48079268292682925, + "grad_norm": 0.10948268312038748, + "kl": 0.078857421875, + "learning_rate": 1.858854268224212e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3154 + }, + { + "completion_length": 1405.8333740234375, + "epoch": 0.4809451219512195, + "grad_norm": 0.10847025830606907, + "kl": 0.07568359375, + "learning_rate": 1.8580792262939773e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3155 + }, + { + "completion_length": 1278.3333740234375, + "epoch": 0.48109756097560974, + "grad_norm": 0.07667222579730337, + "kl": 0.067626953125, + "learning_rate": 1.857304082975834e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3156 + }, + { + "completion_length": 820.0000305175781, + "epoch": 0.48125, + "grad_norm": 0.10738156121331857, + "kl": 0.062744140625, + "learning_rate": 1.8565288384892597e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3157 + }, + { + "completion_length": 984.5000610351562, + "epoch": 0.4814024390243902, + "grad_norm": 2.131625633116444, + "kl": 0.0782470703125, + "learning_rate": 1.8557534930537597e-06, + "loss": 0.0031, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 3158 + }, + { + "completion_length": 1540.8333740234375, + "epoch": 0.4815548780487805, + "grad_norm": 0.09200567391443633, + "kl": 0.068603515625, + "learning_rate": 1.854978046888868e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3159 + }, + { + "completion_length": 1654.5000915527344, + "epoch": 0.4817073170731707, + "grad_norm": 0.10801577350175433, + "kl": 0.06591796875, + "learning_rate": 1.8542025002141474e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3160 + }, + { + "completion_length": 1203.3333740234375, + "epoch": 0.481859756097561, + "grad_norm": 0.1377111896330318, + "kl": 0.10546875, + "learning_rate": 1.853426853249189e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3161 + }, + { + "completion_length": 3216.3333740234375, + "epoch": 0.4820121951219512, + "grad_norm": 0.054464727204399425, + "kl": 0.0517578125, + "learning_rate": 1.8526511062136132e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3162 + }, + { + "completion_length": 1448.8333740234375, + "epoch": 0.4821646341463415, + "grad_norm": 0.10017776019569614, + "kl": 0.05657958984375, + "learning_rate": 1.8518752593270673e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3163 + }, + { + "completion_length": 2023.5, + "epoch": 0.4823170731707317, + "grad_norm": 0.10817690126074404, + "kl": 0.0714111328125, + "learning_rate": 1.8510993128092273e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3164 + }, + { + "completion_length": 1389.3333740234375, + "epoch": 0.48246951219512196, + "grad_norm": 0.10689457705698924, + "kl": 0.07568359375, + "learning_rate": 1.850323266879799e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3165 + }, + { + "completion_length": 1131.5000610351562, + "epoch": 0.4826219512195122, + "grad_norm": 0.10091805550193411, + "kl": 0.07275390625, + "learning_rate": 1.8495471217585131e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3166 + }, + { + "completion_length": 1424.0000610351562, + "epoch": 0.48277439024390245, + "grad_norm": 0.16681139009185517, + "kl": 0.065185546875, + "learning_rate": 1.8487708776651317e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3167 + }, + { + "completion_length": 1744.6666870117188, + "epoch": 0.48292682926829267, + "grad_norm": 0.05671633859892674, + "kl": 0.0452880859375, + "learning_rate": 1.8479945348194423e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3168 + }, + { + "completion_length": 1268.3333740234375, + "epoch": 0.48307926829268294, + "grad_norm": 0.11650705694606131, + "kl": 0.0751953125, + "learning_rate": 1.8472180934412626e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3169 + }, + { + "completion_length": 1351.0, + "epoch": 0.48323170731707316, + "grad_norm": 0.12125419835292375, + "kl": 0.0521240234375, + "learning_rate": 1.8464415537504363e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3170 + }, + { + "completion_length": 2645.6666870117188, + "epoch": 0.4833841463414634, + "grad_norm": 0.09048019032550725, + "kl": 0.074951171875, + "learning_rate": 1.845664915966837e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3171 + }, + { + "completion_length": 1874.0, + "epoch": 0.48353658536585364, + "grad_norm": 0.48970837076806306, + "kl": 0.0662841796875, + "learning_rate": 1.8448881803103637e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3172 + }, + { + "completion_length": 1376.6666870117188, + "epoch": 0.4836890243902439, + "grad_norm": 0.07128063162099452, + "kl": 0.0633544921875, + "learning_rate": 1.8441113470009447e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3173 + }, + { + "completion_length": 2656.5, + "epoch": 0.48384146341463413, + "grad_norm": 0.17255135138455363, + "kl": 0.0577392578125, + "learning_rate": 1.8433344162585355e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3174 + }, + { + "completion_length": 1211.0, + "epoch": 0.4839939024390244, + "grad_norm": 0.11072578003195316, + "kl": 0.072021484375, + "learning_rate": 1.8425573883031192e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3175 + }, + { + "completion_length": 1545.3333740234375, + "epoch": 0.4841463414634146, + "grad_norm": 0.11004532148059133, + "kl": 0.074951171875, + "learning_rate": 1.8417802633547067e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3176 + }, + { + "completion_length": 2551.3333740234375, + "epoch": 0.4842987804878049, + "grad_norm": 0.8396212530700559, + "kl": 0.055908203125, + "learning_rate": 1.8410030416333354e-06, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3177 + }, + { + "completion_length": 1411.166748046875, + "epoch": 0.4844512195121951, + "grad_norm": 0.11304547276493537, + "kl": 0.076416015625, + "learning_rate": 1.8402257233590717e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3178 + }, + { + "completion_length": 698.8333435058594, + "epoch": 0.4846036585365854, + "grad_norm": 0.21024977587586652, + "kl": 0.114990234375, + "learning_rate": 1.8394483087520077e-06, + "loss": 0.0046, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3179 + }, + { + "completion_length": 603.0000305175781, + "epoch": 0.4847560975609756, + "grad_norm": 0.39492206092908794, + "kl": 0.08935546875, + "learning_rate": 1.8386707980322637e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3180 + }, + { + "completion_length": 1076.0000305175781, + "epoch": 0.48490853658536587, + "grad_norm": 0.09330556784776269, + "kl": 0.062744140625, + "learning_rate": 1.8378931914199873e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3181 + }, + { + "completion_length": 759.6666870117188, + "epoch": 0.4850609756097561, + "grad_norm": 0.10723672563188659, + "kl": 0.07861328125, + "learning_rate": 1.8371154891353532e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3182 + }, + { + "completion_length": 809.8333740234375, + "epoch": 0.48521341463414636, + "grad_norm": 0.13046345946919313, + "kl": 0.073486328125, + "learning_rate": 1.8363376913985622e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3183 + }, + { + "completion_length": 954.5000305175781, + "epoch": 0.4853658536585366, + "grad_norm": 0.1342647180816836, + "kl": 0.101806640625, + "learning_rate": 1.8355597984298435e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3184 + }, + { + "completion_length": 998.0, + "epoch": 0.48551829268292684, + "grad_norm": 0.14016044698754132, + "kl": 0.08740234375, + "learning_rate": 1.8347818104494523e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3185 + }, + { + "completion_length": 688.3333435058594, + "epoch": 0.48567073170731706, + "grad_norm": 0.07959248975552476, + "kl": 0.0546875, + "learning_rate": 1.8340037276776715e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3186 + }, + { + "completion_length": 1110.5000610351562, + "epoch": 0.48582317073170733, + "grad_norm": 0.10540649832431984, + "kl": 0.09326171875, + "learning_rate": 1.83322555033481e-06, + "loss": 0.0037, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3187 + }, + { + "completion_length": 990.5000305175781, + "epoch": 0.48597560975609755, + "grad_norm": 0.1451721888550964, + "kl": 0.083984375, + "learning_rate": 1.8324472786412037e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3188 + }, + { + "completion_length": 690.0000305175781, + "epoch": 0.4861280487804878, + "grad_norm": 0.14035992361889527, + "kl": 0.09619140625, + "learning_rate": 1.8316689128172158e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3189 + }, + { + "completion_length": 1064.6667175292969, + "epoch": 0.48628048780487804, + "grad_norm": 0.08797317693865017, + "kl": 0.0703125, + "learning_rate": 1.8308904530832357e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3190 + }, + { + "completion_length": 1159.8333740234375, + "epoch": 0.4864329268292683, + "grad_norm": 0.1114766868989025, + "kl": 0.07763671875, + "learning_rate": 1.83011189965968e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3191 + }, + { + "completion_length": 1163.6667175292969, + "epoch": 0.4865853658536585, + "grad_norm": 0.1140773284468928, + "kl": 0.089111328125, + "learning_rate": 1.8293332527669897e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3192 + }, + { + "completion_length": 1790.666748046875, + "epoch": 0.4867378048780488, + "grad_norm": 0.08893116009968861, + "kl": 0.068115234375, + "learning_rate": 1.8285545126256354e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3193 + }, + { + "completion_length": 873.5000305175781, + "epoch": 0.486890243902439, + "grad_norm": 0.14529041849508523, + "kl": 0.0897216796875, + "learning_rate": 1.8277756794561119e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3194 + }, + { + "completion_length": 1512.5000610351562, + "epoch": 0.4870426829268293, + "grad_norm": 1.8429289700180786, + "kl": 0.09619140625, + "learning_rate": 1.8269967534789406e-06, + "loss": 0.0039, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3195 + }, + { + "completion_length": 1097.3333740234375, + "epoch": 0.4871951219512195, + "grad_norm": 0.21689754371744566, + "kl": 0.112060546875, + "learning_rate": 1.8262177349146702e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3196 + }, + { + "completion_length": 2218.5001220703125, + "epoch": 0.4873475609756098, + "grad_norm": 1.171946842802859, + "kl": 0.07421875, + "learning_rate": 1.8254386239838747e-06, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3197 + }, + { + "completion_length": 1882.166748046875, + "epoch": 0.4875, + "grad_norm": 0.5221418492682366, + "kl": 0.106689453125, + "learning_rate": 1.8246594209071543e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3198 + }, + { + "completion_length": 2670.6666870117188, + "epoch": 0.48765243902439026, + "grad_norm": 1.7617645108292839, + "kl": 0.061767578125, + "learning_rate": 1.8238801259051358e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3199 + }, + { + "completion_length": 3162.0001220703125, + "epoch": 0.4878048780487805, + "grad_norm": 0.06038658526153771, + "kl": 0.050048828125, + "learning_rate": 1.823100739198472e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3200 + }, + { + "completion_length": 3208.0, + "epoch": 0.48795731707317075, + "grad_norm": 0.14187339793904502, + "kl": 0.057861328125, + "learning_rate": 1.8223212610078408e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3201 + }, + { + "completion_length": 2880.5, + "epoch": 0.48810975609756097, + "grad_norm": 0.6708392182102639, + "kl": 0.0565185546875, + "learning_rate": 1.821541691553947e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3202 + }, + { + "completion_length": 2809.8333740234375, + "epoch": 0.48826219512195124, + "grad_norm": 0.1233937526594862, + "kl": 0.0576171875, + "learning_rate": 1.820762031057521e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3203 + }, + { + "completion_length": 2265.3333740234375, + "epoch": 0.48841463414634145, + "grad_norm": 0.0857044891670376, + "kl": 0.067138671875, + "learning_rate": 1.8199822797393182e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3204 + }, + { + "completion_length": 4085.5, + "epoch": 0.4885670731707317, + "grad_norm": 0.047126930275128605, + "kl": 0.0440673828125, + "learning_rate": 1.819202437820121e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3205 + }, + { + "completion_length": 935.5, + "epoch": 0.48871951219512194, + "grad_norm": 0.09924726036867655, + "kl": 0.0479736328125, + "learning_rate": 1.818422505520736e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3206 + }, + { + "completion_length": 3564.5001220703125, + "epoch": 0.4888719512195122, + "grad_norm": 0.04334458738812254, + "kl": 0.04150390625, + "learning_rate": 1.817642483061997e-06, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3207 + }, + { + "completion_length": 2756.166748046875, + "epoch": 0.48902439024390243, + "grad_norm": 0.7206795102795518, + "kl": 0.0660400390625, + "learning_rate": 1.816862370664762e-06, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3208 + }, + { + "completion_length": 2674.5001220703125, + "epoch": 0.4891768292682927, + "grad_norm": 0.05484508190169477, + "kl": 0.0482177734375, + "learning_rate": 1.8160821685499158e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3209 + }, + { + "completion_length": 4028.3333740234375, + "epoch": 0.4893292682926829, + "grad_norm": 0.037543011938185926, + "kl": 0.043701171875, + "learning_rate": 1.8153018769383664e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3210 + }, + { + "completion_length": 2565.666748046875, + "epoch": 0.4894817073170732, + "grad_norm": 0.05597731901850608, + "kl": 0.05615234375, + "learning_rate": 1.8145214960510496e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3211 + }, + { + "completion_length": 2556.8333740234375, + "epoch": 0.4896341463414634, + "grad_norm": 0.12495787771324642, + "kl": 0.06982421875, + "learning_rate": 1.8137410261089253e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3212 + }, + { + "completion_length": 2279.3333740234375, + "epoch": 0.4897865853658537, + "grad_norm": 0.10355632783331548, + "kl": 0.0550537109375, + "learning_rate": 1.8129604673329783e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3213 + }, + { + "completion_length": 3870.0, + "epoch": 0.4899390243902439, + "grad_norm": 0.038017384246243804, + "kl": 0.0487060546875, + "learning_rate": 1.8121798199442191e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3214 + }, + { + "completion_length": 2927.1666870117188, + "epoch": 0.49009146341463417, + "grad_norm": 0.0637059440533884, + "kl": 0.0577392578125, + "learning_rate": 1.8113990841636832e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3215 + }, + { + "completion_length": 2269.8334350585938, + "epoch": 0.4902439024390244, + "grad_norm": 0.06659924686735517, + "kl": 0.0662841796875, + "learning_rate": 1.8106182602124312e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3216 + }, + { + "completion_length": 932.8333435058594, + "epoch": 0.49039634146341465, + "grad_norm": 0.08747350467916451, + "kl": 0.060302734375, + "learning_rate": 1.8098373483115484e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3217 + }, + { + "completion_length": 1365.5, + "epoch": 0.49054878048780487, + "grad_norm": 1.6822738331235476, + "kl": 0.05810546875, + "learning_rate": 1.8090563486821453e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3218 + }, + { + "completion_length": 2469.666748046875, + "epoch": 0.49070121951219514, + "grad_norm": 0.07488487580518934, + "kl": 0.0689697265625, + "learning_rate": 1.808275261545357e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3219 + }, + { + "completion_length": 1047.1666870117188, + "epoch": 0.49085365853658536, + "grad_norm": 0.12213014299031165, + "kl": 0.07763671875, + "learning_rate": 1.8074940871223436e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3220 + }, + { + "completion_length": 3054.3333740234375, + "epoch": 0.49100609756097563, + "grad_norm": 0.05504076486846787, + "kl": 0.055419921875, + "learning_rate": 1.8067128256342894e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3221 + }, + { + "completion_length": 3778.0001220703125, + "epoch": 0.49115853658536585, + "grad_norm": 0.056413939174274, + "kl": 0.0491943359375, + "learning_rate": 1.8059314773024042e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3222 + }, + { + "completion_length": 2755.0, + "epoch": 0.4913109756097561, + "grad_norm": 0.06730406998655822, + "kl": 0.0611572265625, + "learning_rate": 1.8051500423479219e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3223 + }, + { + "completion_length": 2984.666748046875, + "epoch": 0.49146341463414633, + "grad_norm": 0.10747980223986552, + "kl": 0.0595703125, + "learning_rate": 1.8043685209921002e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3224 + }, + { + "completion_length": 2286.3334350585938, + "epoch": 0.4916158536585366, + "grad_norm": 0.15615833545175356, + "kl": 0.083740234375, + "learning_rate": 1.8035869134562232e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3225 + }, + { + "completion_length": 1563.0, + "epoch": 0.4917682926829268, + "grad_norm": 0.16945860599204085, + "kl": 0.0859375, + "learning_rate": 1.802805219961597e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3226 + }, + { + "completion_length": 2396.5000610351562, + "epoch": 0.4919207317073171, + "grad_norm": 0.07951240134940722, + "kl": 0.0660400390625, + "learning_rate": 1.8020234407295545e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3227 + }, + { + "completion_length": 2856.5, + "epoch": 0.4920731707317073, + "grad_norm": 0.7176798206266455, + "kl": 0.056884765625, + "learning_rate": 1.8012415759814505e-06, + "loss": 0.0023, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 3228 + }, + { + "completion_length": 3232.166748046875, + "epoch": 0.4922256097560976, + "grad_norm": 0.12404587787365388, + "kl": 0.069091796875, + "learning_rate": 1.8004596259386662e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3229 + }, + { + "completion_length": 2900.83349609375, + "epoch": 0.4923780487804878, + "grad_norm": 0.04124028333276008, + "kl": 0.051513671875, + "learning_rate": 1.799677590822605e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3230 + }, + { + "completion_length": 1916.666748046875, + "epoch": 0.49253048780487807, + "grad_norm": 1.5011883321123, + "kl": 0.092041015625, + "learning_rate": 1.7988954708546956e-06, + "loss": 0.0037, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3231 + }, + { + "completion_length": 1534.3334350585938, + "epoch": 0.4926829268292683, + "grad_norm": 1.5483052361084304, + "kl": 0.08447265625, + "learning_rate": 1.7981132662563906e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3232 + }, + { + "completion_length": 2748.0001220703125, + "epoch": 0.49283536585365856, + "grad_norm": 0.050055629941796374, + "kl": 0.053955078125, + "learning_rate": 1.7973309772491661e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3233 + }, + { + "completion_length": 2089.666717529297, + "epoch": 0.4929878048780488, + "grad_norm": 1.4961596920434974, + "kl": 0.076904296875, + "learning_rate": 1.7965486040545224e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3234 + }, + { + "completion_length": 1792.3333740234375, + "epoch": 0.49314024390243905, + "grad_norm": 0.10342227541680807, + "kl": 0.0538330078125, + "learning_rate": 1.7957661468939836e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3235 + }, + { + "completion_length": 2241.0, + "epoch": 0.49329268292682926, + "grad_norm": 0.08162371638458911, + "kl": 0.0560302734375, + "learning_rate": 1.794983605989098e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3236 + }, + { + "completion_length": 2140.666748046875, + "epoch": 0.49344512195121953, + "grad_norm": 0.08609493983666414, + "kl": 0.0736083984375, + "learning_rate": 1.7942009815614367e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3237 + }, + { + "completion_length": 950.1666870117188, + "epoch": 0.49359756097560975, + "grad_norm": 0.16359948616871026, + "kl": 0.092529296875, + "learning_rate": 1.7934182738325954e-06, + "loss": 0.0037, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3238 + }, + { + "completion_length": 2722.5, + "epoch": 0.49375, + "grad_norm": 0.05562515660786532, + "kl": 0.062255859375, + "learning_rate": 1.7926354830241926e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3239 + }, + { + "completion_length": 2433.166748046875, + "epoch": 0.49390243902439024, + "grad_norm": 0.12743459280143146, + "kl": 0.07177734375, + "learning_rate": 1.7918526093578702e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3240 + }, + { + "completion_length": 1529.6666870117188, + "epoch": 0.4940548780487805, + "grad_norm": 1.6085715369744231, + "kl": 0.074462890625, + "learning_rate": 1.7910696530552954e-06, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3241 + }, + { + "completion_length": 1675.666748046875, + "epoch": 0.4942073170731707, + "grad_norm": 0.11766237873801369, + "kl": 0.0635986328125, + "learning_rate": 1.7902866143381559e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3242 + }, + { + "completion_length": 1372.3333740234375, + "epoch": 0.494359756097561, + "grad_norm": 0.1404395625109093, + "kl": 0.0751953125, + "learning_rate": 1.7895034934281653e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3243 + }, + { + "completion_length": 2266.8333740234375, + "epoch": 0.4945121951219512, + "grad_norm": 0.08077885791783428, + "kl": 0.0638427734375, + "learning_rate": 1.7887202905470582e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3244 + }, + { + "completion_length": 1889.8334350585938, + "epoch": 0.4946646341463415, + "grad_norm": 0.08129934741090165, + "kl": 0.0633544921875, + "learning_rate": 1.7879370059165955e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3245 + }, + { + "completion_length": 2566.8334350585938, + "epoch": 0.4948170731707317, + "grad_norm": 0.1360949794328178, + "kl": 0.0718994140625, + "learning_rate": 1.7871536397585573e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3246 + }, + { + "completion_length": 2514.0000610351562, + "epoch": 0.494969512195122, + "grad_norm": 0.08259707032833165, + "kl": 0.074951171875, + "learning_rate": 1.7863701922947508e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3247 + }, + { + "completion_length": 2246.1666870117188, + "epoch": 0.4951219512195122, + "grad_norm": 1.2795706556014952, + "kl": 0.083984375, + "learning_rate": 1.7855866637470027e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3248 + }, + { + "completion_length": 2642.0000610351562, + "epoch": 0.49527439024390246, + "grad_norm": 0.744062568269984, + "kl": 0.0784912109375, + "learning_rate": 1.7848030543371648e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3249 + }, + { + "completion_length": 2217.3333740234375, + "epoch": 0.4954268292682927, + "grad_norm": 0.0664476624492436, + "kl": 0.05810546875, + "learning_rate": 1.784019364287112e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3250 + }, + { + "completion_length": 2894.3333740234375, + "epoch": 0.49557926829268295, + "grad_norm": 0.24591664135415592, + "kl": 0.0716552734375, + "learning_rate": 1.7832355938187403e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3251 + }, + { + "completion_length": 3533.0001220703125, + "epoch": 0.49573170731707317, + "grad_norm": 0.06689418043527091, + "kl": 0.0555419921875, + "learning_rate": 1.7824517431539697e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3252 + }, + { + "completion_length": 1670.0, + "epoch": 0.49588414634146344, + "grad_norm": 0.09915228802837044, + "kl": 0.0682373046875, + "learning_rate": 1.7816678125147425e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3253 + }, + { + "completion_length": 2308.0001220703125, + "epoch": 0.49603658536585366, + "grad_norm": 0.8651353176934764, + "kl": 0.061767578125, + "learning_rate": 1.7808838021230244e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3254 + }, + { + "completion_length": 2278.1666717529297, + "epoch": 0.4961890243902439, + "grad_norm": 0.11919484882252054, + "kl": 0.069091796875, + "learning_rate": 1.780099712200802e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3255 + }, + { + "completion_length": 2710.3333740234375, + "epoch": 0.49634146341463414, + "grad_norm": 0.07474441139852392, + "kl": 0.0799560546875, + "learning_rate": 1.7793155429700868e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3256 + }, + { + "completion_length": 2146.5001220703125, + "epoch": 0.4964939024390244, + "grad_norm": 0.07067749608783254, + "kl": 0.074462890625, + "learning_rate": 1.7785312946529108e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3257 + }, + { + "completion_length": 1187.8333740234375, + "epoch": 0.49664634146341463, + "grad_norm": 0.2598660746204819, + "kl": 0.095947265625, + "learning_rate": 1.7777469674713287e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3258 + }, + { + "completion_length": 2485.0, + "epoch": 0.4967987804878049, + "grad_norm": 0.18518608239121037, + "kl": 0.0869140625, + "learning_rate": 1.7769625616474185e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3259 + }, + { + "completion_length": 1838.5001220703125, + "epoch": 0.4969512195121951, + "grad_norm": 0.06953604521078587, + "kl": 0.0609130859375, + "learning_rate": 1.776178077403279e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3260 + }, + { + "completion_length": 1320.6666870117188, + "epoch": 0.4971036585365854, + "grad_norm": 0.09500513551380571, + "kl": 0.07568359375, + "learning_rate": 1.7753935149610332e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3261 + }, + { + "completion_length": 1364.3333740234375, + "epoch": 0.4972560975609756, + "grad_norm": 0.08402587738873694, + "kl": 0.0714111328125, + "learning_rate": 1.7746088745428242e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3262 + }, + { + "completion_length": 1055.0000610351562, + "epoch": 0.4974085365853659, + "grad_norm": 0.09415794360871245, + "kl": 0.08154296875, + "learning_rate": 1.7738241563708181e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3263 + }, + { + "completion_length": 1774.8334350585938, + "epoch": 0.4975609756097561, + "grad_norm": 0.07753621831339777, + "kl": 0.0675048828125, + "learning_rate": 1.7730393606672033e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3264 + }, + { + "completion_length": 1861.0000610351562, + "epoch": 0.49771341463414637, + "grad_norm": 0.06074879161406488, + "kl": 0.0634765625, + "learning_rate": 1.77225448765419e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3265 + }, + { + "completion_length": 1818.3333740234375, + "epoch": 0.4978658536585366, + "grad_norm": 0.08696164082828786, + "kl": 0.075439453125, + "learning_rate": 1.7714695375540093e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3266 + }, + { + "completion_length": 1656.8333740234375, + "epoch": 0.49801829268292686, + "grad_norm": 1.0069551204065395, + "kl": 0.0662841796875, + "learning_rate": 1.7706845105889161e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3267 + }, + { + "completion_length": 1229.1666870117188, + "epoch": 0.49817073170731707, + "grad_norm": 0.3068727106308957, + "kl": 0.12353515625, + "learning_rate": 1.769899406981185e-06, + "loss": 0.0049, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3268 + }, + { + "completion_length": 3082.0, + "epoch": 0.49832317073170734, + "grad_norm": 0.9753652933454673, + "kl": 0.0640869140625, + "learning_rate": 1.7691142269531133e-06, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3269 + }, + { + "completion_length": 1091.5000305175781, + "epoch": 0.49847560975609756, + "grad_norm": 1.321372246282119, + "kl": 0.084716796875, + "learning_rate": 1.7683289707270206e-06, + "loss": 0.0034, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 3270 + }, + { + "completion_length": 880.0, + "epoch": 0.49862804878048783, + "grad_norm": 0.10801011846934402, + "kl": 0.0743408203125, + "learning_rate": 1.767543638525246e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3271 + }, + { + "completion_length": 1841.8333435058594, + "epoch": 0.49878048780487805, + "grad_norm": 0.09104350137630639, + "kl": 0.0703125, + "learning_rate": 1.7667582305701528e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3272 + }, + { + "completion_length": 790.0000305175781, + "epoch": 0.4989329268292683, + "grad_norm": 0.10978031123489441, + "kl": 0.084228515625, + "learning_rate": 1.7659727470841233e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3273 + }, + { + "completion_length": 1663.0000610351562, + "epoch": 0.49908536585365854, + "grad_norm": 1.0874481056291323, + "kl": 0.08544921875, + "learning_rate": 1.7651871882895633e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3274 + }, + { + "completion_length": 1262.0000610351562, + "epoch": 0.4992378048780488, + "grad_norm": 0.11653333457355498, + "kl": 0.111328125, + "learning_rate": 1.7644015544088979e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3275 + }, + { + "completion_length": 2987.33349609375, + "epoch": 0.499390243902439, + "grad_norm": 0.1387413716862753, + "kl": 0.077880859375, + "learning_rate": 1.7636158456645754e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3276 + }, + { + "completion_length": 1864.166748046875, + "epoch": 0.4995426829268293, + "grad_norm": 0.14253959023968657, + "kl": 0.103759765625, + "learning_rate": 1.7628300622790636e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3277 + }, + { + "completion_length": 1015.8333740234375, + "epoch": 0.4996951219512195, + "grad_norm": 0.13974683916804842, + "kl": 0.120361328125, + "learning_rate": 1.762044204474852e-06, + "loss": 0.0048, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3278 + }, + { + "completion_length": 2108.3333740234375, + "epoch": 0.4998475609756098, + "grad_norm": 1.2229143183635895, + "kl": 0.086669921875, + "learning_rate": 1.7612582724744524e-06, + "loss": 0.0035, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3279 + }, + { + "completion_length": 558.3333587646484, + "epoch": 0.5, + "grad_norm": 3.2621934322283614, + "kl": 0.12451171875, + "learning_rate": 1.7604722665003958e-06, + "loss": 0.005, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3280 + }, + { + "completion_length": 1373.8333740234375, + "epoch": 0.5001524390243902, + "grad_norm": 0.2168511748146793, + "kl": 0.092529296875, + "learning_rate": 1.7596861867752348e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3281 + }, + { + "completion_length": 1775.0000610351562, + "epoch": 0.5003048780487804, + "grad_norm": 0.13693881817414394, + "kl": 0.102783203125, + "learning_rate": 1.7589000335215434e-06, + "loss": 0.0041, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3282 + }, + { + "completion_length": 820.8333435058594, + "epoch": 0.5004573170731708, + "grad_norm": 0.19004829595863867, + "kl": 0.128173828125, + "learning_rate": 1.758113806961916e-06, + "loss": 0.0051, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3283 + }, + { + "completion_length": 1272.5000610351562, + "epoch": 0.500609756097561, + "grad_norm": 0.13391134878621472, + "kl": 0.106201171875, + "learning_rate": 1.7573275073189677e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3284 + }, + { + "completion_length": 1014.5, + "epoch": 0.5007621951219512, + "grad_norm": 0.23078792788759417, + "kl": 0.113037109375, + "learning_rate": 1.756541134815334e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3285 + }, + { + "completion_length": 1828.166748046875, + "epoch": 0.5009146341463414, + "grad_norm": 0.10558879389298056, + "kl": 0.0849609375, + "learning_rate": 1.7557546896736718e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3286 + }, + { + "completion_length": 1743.3333740234375, + "epoch": 0.5010670731707317, + "grad_norm": 0.10601626317178824, + "kl": 0.09326171875, + "learning_rate": 1.7549681721166581e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3287 + }, + { + "completion_length": 734.5000305175781, + "epoch": 0.501219512195122, + "grad_norm": 1.7692601964218124, + "kl": 0.10205078125, + "learning_rate": 1.7541815823669903e-06, + "loss": 0.0041, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3288 + }, + { + "completion_length": 560.3333435058594, + "epoch": 0.5013719512195122, + "grad_norm": 0.1299206821183147, + "kl": 0.087646484375, + "learning_rate": 1.7533949206473867e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3289 + }, + { + "completion_length": 828.8333435058594, + "epoch": 0.5015243902439024, + "grad_norm": 0.19087197064925474, + "kl": 0.08984375, + "learning_rate": 1.752608187180585e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3290 + }, + { + "completion_length": 1116.8333740234375, + "epoch": 0.5016768292682927, + "grad_norm": 0.1115741470128612, + "kl": 0.086669921875, + "learning_rate": 1.7518213821893446e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3291 + }, + { + "completion_length": 763.0000305175781, + "epoch": 0.5018292682926829, + "grad_norm": 2.113236167872905, + "kl": 0.0986328125, + "learning_rate": 1.7510345058964446e-06, + "loss": 0.0039, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3292 + }, + { + "completion_length": 1760.0001220703125, + "epoch": 0.5019817073170731, + "grad_norm": 1.485931789991304, + "kl": 0.086181640625, + "learning_rate": 1.7502475585246833e-06, + "loss": 0.0034, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3293 + }, + { + "completion_length": 1001.5, + "epoch": 0.5021341463414634, + "grad_norm": 0.09972361298087111, + "kl": 0.07080078125, + "learning_rate": 1.7494605402968805e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3294 + }, + { + "completion_length": 1050.8333740234375, + "epoch": 0.5022865853658537, + "grad_norm": 0.33028193873898376, + "kl": 0.089599609375, + "learning_rate": 1.7486734514358756e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3295 + }, + { + "completion_length": 1155.6667175292969, + "epoch": 0.5024390243902439, + "grad_norm": 0.1263421720812801, + "kl": 0.0771484375, + "learning_rate": 1.7478862921645273e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3296 + }, + { + "completion_length": 1154.5000610351562, + "epoch": 0.5025914634146341, + "grad_norm": 0.11897036650758328, + "kl": 0.105712890625, + "learning_rate": 1.747099062705716e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3297 + }, + { + "completion_length": 906.5000305175781, + "epoch": 0.5027439024390243, + "grad_norm": 0.1373896069233303, + "kl": 0.0633544921875, + "learning_rate": 1.7463117632823397e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3298 + }, + { + "completion_length": 910.5000305175781, + "epoch": 0.5028963414634147, + "grad_norm": 0.28995498780750856, + "kl": 0.101318359375, + "learning_rate": 1.7455243941173177e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3299 + }, + { + "completion_length": 778.8333740234375, + "epoch": 0.5030487804878049, + "grad_norm": 0.1330886769315491, + "kl": 0.12353515625, + "learning_rate": 1.7447369554335887e-06, + "loss": 0.005, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3300 + }, + { + "completion_length": 1146.3333740234375, + "epoch": 0.5032012195121951, + "grad_norm": 0.27373904730171855, + "kl": 0.1142578125, + "learning_rate": 1.7439494474541118e-06, + "loss": 0.0046, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3301 + }, + { + "completion_length": 823.1666870117188, + "epoch": 0.5033536585365853, + "grad_norm": 0.19139391820567028, + "kl": 0.119873046875, + "learning_rate": 1.7431618704018636e-06, + "loss": 0.0048, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3302 + }, + { + "completion_length": 586.8333435058594, + "epoch": 0.5035060975609756, + "grad_norm": 0.21420524141914285, + "kl": 0.1484375, + "learning_rate": 1.7423742244998432e-06, + "loss": 0.0059, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3303 + }, + { + "completion_length": 1264.0000610351562, + "epoch": 0.5036585365853659, + "grad_norm": 0.12353600570215585, + "kl": 0.115966796875, + "learning_rate": 1.7415865099710657e-06, + "loss": 0.0046, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3304 + }, + { + "completion_length": 901.3333435058594, + "epoch": 0.5038109756097561, + "grad_norm": 1.8653579239988376, + "kl": 0.126708984375, + "learning_rate": 1.7407987270385693e-06, + "loss": 0.0051, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3305 + }, + { + "completion_length": 538.3333587646484, + "epoch": 0.5039634146341463, + "grad_norm": 0.20818665783606896, + "kl": 0.109375, + "learning_rate": 1.7400108759254096e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3306 + }, + { + "completion_length": 1273.6667175292969, + "epoch": 0.5041158536585366, + "grad_norm": 0.13188672144207828, + "kl": 0.087158203125, + "learning_rate": 1.739222956854661e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3307 + }, + { + "completion_length": 776.1666870117188, + "epoch": 0.5042682926829268, + "grad_norm": 0.16793118672606033, + "kl": 0.107421875, + "learning_rate": 1.7384349700494184e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3308 + }, + { + "completion_length": 1467.5000610351562, + "epoch": 0.504420731707317, + "grad_norm": 0.10837840585820725, + "kl": 0.1015625, + "learning_rate": 1.7376469157327946e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3309 + }, + { + "completion_length": 702.8333435058594, + "epoch": 0.5045731707317073, + "grad_norm": 0.12833610524765035, + "kl": 0.092041015625, + "learning_rate": 1.7368587941279239e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3310 + }, + { + "completion_length": 1462.3333435058594, + "epoch": 0.5047256097560976, + "grad_norm": 0.09937274619199239, + "kl": 0.075927734375, + "learning_rate": 1.7360706054579566e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3311 + }, + { + "completion_length": 967.1667175292969, + "epoch": 0.5048780487804878, + "grad_norm": 2.082915067290899, + "kl": 0.116943359375, + "learning_rate": 1.735282349946064e-06, + "loss": 0.0047, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3312 + }, + { + "completion_length": 1273.8333740234375, + "epoch": 0.505030487804878, + "grad_norm": 1.3124600416049865, + "kl": 0.089599609375, + "learning_rate": 1.7344940278154356e-06, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3313 + }, + { + "completion_length": 517.5, + "epoch": 0.5051829268292682, + "grad_norm": 0.17363701221889108, + "kl": 0.08984375, + "learning_rate": 1.7337056392892802e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3314 + }, + { + "completion_length": 3271.0001220703125, + "epoch": 0.5053353658536586, + "grad_norm": 0.07208536860179317, + "kl": 0.076416015625, + "learning_rate": 1.7329171845908248e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3315 + }, + { + "completion_length": 1918.6666870117188, + "epoch": 0.5054878048780488, + "grad_norm": 0.10167378497611576, + "kl": 0.089111328125, + "learning_rate": 1.732128663943315e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3316 + }, + { + "completion_length": 1949.166748046875, + "epoch": 0.505640243902439, + "grad_norm": 0.14564504540630024, + "kl": 0.0908203125, + "learning_rate": 1.7313400775700166e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3317 + }, + { + "completion_length": 2415.6666870117188, + "epoch": 0.5057926829268292, + "grad_norm": 0.13173799713652254, + "kl": 0.0845947265625, + "learning_rate": 1.730551425694212e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3318 + }, + { + "completion_length": 2607.0, + "epoch": 0.5059451219512195, + "grad_norm": 0.06956826365678076, + "kl": 0.069580078125, + "learning_rate": 1.7297627085392038e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3319 + }, + { + "completion_length": 2588.5001220703125, + "epoch": 0.5060975609756098, + "grad_norm": 0.05969190046527715, + "kl": 0.06787109375, + "learning_rate": 1.7289739263283118e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3320 + }, + { + "completion_length": 1737.0, + "epoch": 0.50625, + "grad_norm": 0.13440497254252348, + "kl": 0.083740234375, + "learning_rate": 1.7281850792848752e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3321 + }, + { + "completion_length": 3190.166748046875, + "epoch": 0.5064024390243902, + "grad_norm": 0.07152830822083035, + "kl": 0.072021484375, + "learning_rate": 1.7273961676322507e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3322 + }, + { + "completion_length": 3210.5001220703125, + "epoch": 0.5065548780487805, + "grad_norm": 0.05232790931053159, + "kl": 0.06005859375, + "learning_rate": 1.7266071915938146e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3323 + }, + { + "completion_length": 3521.3333740234375, + "epoch": 0.5067073170731707, + "grad_norm": 0.04723655536559781, + "kl": 0.0567626953125, + "learning_rate": 1.7258181513929593e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3324 + }, + { + "completion_length": 2319.8333740234375, + "epoch": 0.506859756097561, + "grad_norm": 0.06040106883706661, + "kl": 0.052001953125, + "learning_rate": 1.7250290472530975e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3325 + }, + { + "completion_length": 2560.8333740234375, + "epoch": 0.5070121951219512, + "grad_norm": 0.5989076696873277, + "kl": 0.0850830078125, + "learning_rate": 1.7242398793976588e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3326 + }, + { + "completion_length": 2855.0, + "epoch": 0.5071646341463415, + "grad_norm": 0.06935571598547868, + "kl": 0.0499267578125, + "learning_rate": 1.723450648050091e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3327 + }, + { + "completion_length": 1805.6666870117188, + "epoch": 0.5073170731707317, + "grad_norm": 0.09252810952249389, + "kl": 0.0712890625, + "learning_rate": 1.7226613534338608e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3328 + }, + { + "completion_length": 1706.6667175292969, + "epoch": 0.5074695121951219, + "grad_norm": 0.10920694687100035, + "kl": 0.09228515625, + "learning_rate": 1.7218719957724514e-06, + "loss": 0.0037, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3329 + }, + { + "completion_length": 3326.8333740234375, + "epoch": 0.5076219512195121, + "grad_norm": 0.04045665271729233, + "kl": 0.0574951171875, + "learning_rate": 1.721082575289365e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3330 + }, + { + "completion_length": 1870.166748046875, + "epoch": 0.5077743902439025, + "grad_norm": 0.08607103588708466, + "kl": 0.0810546875, + "learning_rate": 1.7202930922081207e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3331 + }, + { + "completion_length": 1741.8333740234375, + "epoch": 0.5079268292682927, + "grad_norm": 0.08785918771023939, + "kl": 0.07080078125, + "learning_rate": 1.7195035467522556e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3332 + }, + { + "completion_length": 2346.666748046875, + "epoch": 0.5080792682926829, + "grad_norm": 0.1059049993208485, + "kl": 0.0733642578125, + "learning_rate": 1.7187139391453252e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3333 + }, + { + "completion_length": 1858.0000915527344, + "epoch": 0.5082317073170731, + "grad_norm": 0.23912903184014286, + "kl": 0.0947265625, + "learning_rate": 1.7179242696109013e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3334 + }, + { + "completion_length": 2327.5000610351562, + "epoch": 0.5083841463414634, + "grad_norm": 0.0754186372993523, + "kl": 0.0675048828125, + "learning_rate": 1.7171345383725748e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3335 + }, + { + "completion_length": 2292.8333740234375, + "epoch": 0.5085365853658537, + "grad_norm": 0.08666236036924216, + "kl": 0.078857421875, + "learning_rate": 1.716344745653952e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3336 + }, + { + "completion_length": 2749.5, + "epoch": 0.5086890243902439, + "grad_norm": 0.07530504059302844, + "kl": 0.067138671875, + "learning_rate": 1.7155548916786588e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3337 + }, + { + "completion_length": 2566.3333740234375, + "epoch": 0.5088414634146341, + "grad_norm": 0.06435320565438553, + "kl": 0.0650634765625, + "learning_rate": 1.7147649766703369e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3338 + }, + { + "completion_length": 1604.8333740234375, + "epoch": 0.5089939024390244, + "grad_norm": 0.130378381387198, + "kl": 0.081298828125, + "learning_rate": 1.7139750008526466e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3339 + }, + { + "completion_length": 2354.666717529297, + "epoch": 0.5091463414634146, + "grad_norm": 0.07353363242358411, + "kl": 0.0709228515625, + "learning_rate": 1.7131849644492634e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3340 + }, + { + "completion_length": 2808.0, + "epoch": 0.5092987804878049, + "grad_norm": 0.07030634971839232, + "kl": 0.059326171875, + "learning_rate": 1.7123948676838826e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3341 + }, + { + "completion_length": 2234.0001220703125, + "epoch": 0.5094512195121951, + "grad_norm": 0.07473502142178802, + "kl": 0.058349609375, + "learning_rate": 1.7116047107802139e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3342 + }, + { + "completion_length": 1816.666748046875, + "epoch": 0.5096036585365854, + "grad_norm": 0.1268687760987166, + "kl": 0.0723876953125, + "learning_rate": 1.7108144939619859e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3343 + }, + { + "completion_length": 1667.5000610351562, + "epoch": 0.5097560975609756, + "grad_norm": 0.08873218962234057, + "kl": 0.059326171875, + "learning_rate": 1.7100242174529439e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3344 + }, + { + "completion_length": 1951.0, + "epoch": 0.5099085365853658, + "grad_norm": 0.08444055711059119, + "kl": 0.0584716796875, + "learning_rate": 1.7092338814768491e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3345 + }, + { + "completion_length": 2543.666748046875, + "epoch": 0.510060975609756, + "grad_norm": 0.12971891624595053, + "kl": 0.066650390625, + "learning_rate": 1.7084434862574807e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3346 + }, + { + "completion_length": 3282.666748046875, + "epoch": 0.5102134146341464, + "grad_norm": 0.06334975414470266, + "kl": 0.06884765625, + "learning_rate": 1.7076530320186337e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3347 + }, + { + "completion_length": 1105.1666870117188, + "epoch": 0.5103658536585366, + "grad_norm": 0.12427217160237398, + "kl": 0.056396484375, + "learning_rate": 1.7068625189841213e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3348 + }, + { + "completion_length": 2078.8334350585938, + "epoch": 0.5105182926829268, + "grad_norm": 0.103938852335587, + "kl": 0.089599609375, + "learning_rate": 1.7060719473777715e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3349 + }, + { + "completion_length": 1734.1666870117188, + "epoch": 0.510670731707317, + "grad_norm": 0.17518672633717078, + "kl": 0.068115234375, + "learning_rate": 1.7052813174234304e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3350 + }, + { + "completion_length": 583.3333435058594, + "epoch": 0.5108231707317074, + "grad_norm": 0.21682331006562794, + "kl": 0.095703125, + "learning_rate": 1.704490629344959e-06, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3351 + }, + { + "completion_length": 2112.8333435058594, + "epoch": 0.5109756097560976, + "grad_norm": 0.12986925857108583, + "kl": 0.0648193359375, + "learning_rate": 1.7036998833662359e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3352 + }, + { + "completion_length": 1037.6666870117188, + "epoch": 0.5111280487804878, + "grad_norm": 0.11192076531067138, + "kl": 0.0908203125, + "learning_rate": 1.702909079711157e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3353 + }, + { + "completion_length": 2393.5000610351562, + "epoch": 0.511280487804878, + "grad_norm": 0.06240747498049684, + "kl": 0.068115234375, + "learning_rate": 1.7021182186036325e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3354 + }, + { + "completion_length": 1095.1666717529297, + "epoch": 0.5114329268292683, + "grad_norm": 0.20416197336059885, + "kl": 0.0867919921875, + "learning_rate": 1.7013273002675898e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3355 + }, + { + "completion_length": 922.5000305175781, + "epoch": 0.5115853658536585, + "grad_norm": 0.15472370557030674, + "kl": 0.084716796875, + "learning_rate": 1.7005363249269726e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3356 + }, + { + "completion_length": 2767.3333740234375, + "epoch": 0.5117378048780488, + "grad_norm": 0.07814864263901734, + "kl": 0.056884765625, + "learning_rate": 1.6997452928057413e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3357 + }, + { + "completion_length": 703.5000305175781, + "epoch": 0.511890243902439, + "grad_norm": 0.21264014107998305, + "kl": 0.07861328125, + "learning_rate": 1.698954204127871e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3358 + }, + { + "completion_length": 2500.8333740234375, + "epoch": 0.5120426829268293, + "grad_norm": 0.0818119575855432, + "kl": 0.0697021484375, + "learning_rate": 1.6981630591173537e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3359 + }, + { + "completion_length": 924.1666870117188, + "epoch": 0.5121951219512195, + "grad_norm": 0.12899299758163035, + "kl": 0.0623779296875, + "learning_rate": 1.6973718579981973e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3360 + }, + { + "completion_length": 1574.6667175292969, + "epoch": 0.5123475609756097, + "grad_norm": 0.08076763107772492, + "kl": 0.0550537109375, + "learning_rate": 1.6965806009944255e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3361 + }, + { + "completion_length": 949.0000305175781, + "epoch": 0.5125, + "grad_norm": 0.1451295458807033, + "kl": 0.0848388671875, + "learning_rate": 1.6957892883300778e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3362 + }, + { + "completion_length": 2483.0000610351562, + "epoch": 0.5126524390243903, + "grad_norm": 0.16995320222180335, + "kl": 0.084716796875, + "learning_rate": 1.694997920229209e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3363 + }, + { + "completion_length": 1118.166748046875, + "epoch": 0.5128048780487805, + "grad_norm": 0.18327170672287815, + "kl": 0.0869140625, + "learning_rate": 1.6942064969158907e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3364 + }, + { + "completion_length": 1588.166748046875, + "epoch": 0.5129573170731707, + "grad_norm": 0.11617501245203685, + "kl": 0.087158203125, + "learning_rate": 1.693415018614209e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3365 + }, + { + "completion_length": 3613.5, + "epoch": 0.5131097560975609, + "grad_norm": 0.05095084221879876, + "kl": 0.06103515625, + "learning_rate": 1.692623485548267e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3366 + }, + { + "completion_length": 2706.5001220703125, + "epoch": 0.5132621951219513, + "grad_norm": 0.09304618990391815, + "kl": 0.0693359375, + "learning_rate": 1.6918318979421812e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3367 + }, + { + "completion_length": 1692.3334197998047, + "epoch": 0.5134146341463415, + "grad_norm": 0.11936581675747311, + "kl": 0.0621337890625, + "learning_rate": 1.6910402560200854e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3368 + }, + { + "completion_length": 1112.6667175292969, + "epoch": 0.5135670731707317, + "grad_norm": 0.1500839331582503, + "kl": 0.093505859375, + "learning_rate": 1.6902485600061275e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3369 + }, + { + "completion_length": 1118.8333435058594, + "epoch": 0.5137195121951219, + "grad_norm": 2.0951273974209923, + "kl": 0.077880859375, + "learning_rate": 1.6894568101244725e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3370 + }, + { + "completion_length": 842.1666870117188, + "epoch": 0.5138719512195122, + "grad_norm": 0.34858834508475695, + "kl": 0.0830078125, + "learning_rate": 1.6886650065992978e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3371 + }, + { + "completion_length": 2159.666748046875, + "epoch": 0.5140243902439025, + "grad_norm": 0.09300046710819401, + "kl": 0.0615234375, + "learning_rate": 1.6878731496547987e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3372 + }, + { + "completion_length": 3355.0, + "epoch": 0.5141768292682927, + "grad_norm": 0.06807618076611335, + "kl": 0.0654296875, + "learning_rate": 1.6870812395151849e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3373 + }, + { + "completion_length": 2026.3333740234375, + "epoch": 0.5143292682926829, + "grad_norm": 0.06991737301157185, + "kl": 0.068359375, + "learning_rate": 1.6862892764046799e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3374 + }, + { + "completion_length": 1710.0000610351562, + "epoch": 0.5144817073170732, + "grad_norm": 0.12393037290767016, + "kl": 0.080322265625, + "learning_rate": 1.6854972605475238e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3375 + }, + { + "completion_length": 1655.3334350585938, + "epoch": 0.5146341463414634, + "grad_norm": 0.10579376338723653, + "kl": 0.08203125, + "learning_rate": 1.6847051921679702e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3376 + }, + { + "completion_length": 1073.0000610351562, + "epoch": 0.5147865853658536, + "grad_norm": 0.0884656629365418, + "kl": 0.0579833984375, + "learning_rate": 1.6839130714902895e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3377 + }, + { + "completion_length": 2493.1666870117188, + "epoch": 0.5149390243902439, + "grad_norm": 0.07899529882733314, + "kl": 0.071044921875, + "learning_rate": 1.683120898738765e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3378 + }, + { + "completion_length": 1067.8333740234375, + "epoch": 0.5150914634146342, + "grad_norm": 0.15474834696458134, + "kl": 0.088623046875, + "learning_rate": 1.6823286741376956e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3379 + }, + { + "completion_length": 1814.3333435058594, + "epoch": 0.5152439024390244, + "grad_norm": 0.10856079699743255, + "kl": 0.0643310546875, + "learning_rate": 1.6815363979113947e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3380 + }, + { + "completion_length": 2446.6666870117188, + "epoch": 0.5153963414634146, + "grad_norm": 0.06861392808126651, + "kl": 0.063720703125, + "learning_rate": 1.6807440702841904e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3381 + }, + { + "completion_length": 1125.8333740234375, + "epoch": 0.5155487804878048, + "grad_norm": 0.08941147731252183, + "kl": 0.060546875, + "learning_rate": 1.6799516914804252e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3382 + }, + { + "completion_length": 1019.3333435058594, + "epoch": 0.5157012195121952, + "grad_norm": 0.09209441242516994, + "kl": 0.0728759765625, + "learning_rate": 1.679159261724457e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3383 + }, + { + "completion_length": 2708.8333740234375, + "epoch": 0.5158536585365854, + "grad_norm": 0.3562568348287704, + "kl": 0.095947265625, + "learning_rate": 1.6783667812406569e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3384 + }, + { + "completion_length": 3400.166748046875, + "epoch": 0.5160060975609756, + "grad_norm": 0.0492118819927923, + "kl": 0.05517578125, + "learning_rate": 1.6775742502534103e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3385 + }, + { + "completion_length": 816.6666870117188, + "epoch": 0.5161585365853658, + "grad_norm": 0.2317225854925365, + "kl": 0.08935546875, + "learning_rate": 1.6767816689871182e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3386 + }, + { + "completion_length": 3253.666748046875, + "epoch": 0.5163109756097561, + "grad_norm": 0.04669298249235535, + "kl": 0.05322265625, + "learning_rate": 1.6759890376661947e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3387 + }, + { + "completion_length": 2227.5, + "epoch": 0.5164634146341464, + "grad_norm": 0.07672303585910215, + "kl": 0.0614013671875, + "learning_rate": 1.6751963565150682e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3388 + }, + { + "completion_length": 785.5000305175781, + "epoch": 0.5166158536585366, + "grad_norm": 0.30625835966932385, + "kl": 0.0587158203125, + "learning_rate": 1.6744036257581819e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3389 + }, + { + "completion_length": 1016.3333740234375, + "epoch": 0.5167682926829268, + "grad_norm": 0.07846694689476315, + "kl": 0.0487060546875, + "learning_rate": 1.6736108456199923e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3390 + }, + { + "completion_length": 1172.6666870117188, + "epoch": 0.5169207317073171, + "grad_norm": 1.5796042912889685, + "kl": 0.089111328125, + "learning_rate": 1.6728180163249706e-06, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3391 + }, + { + "completion_length": 1637.5, + "epoch": 0.5170731707317073, + "grad_norm": 0.08235738105993895, + "kl": 0.066162109375, + "learning_rate": 1.672025138097601e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3392 + }, + { + "completion_length": 1047.3333740234375, + "epoch": 0.5172256097560975, + "grad_norm": 0.11693079517697542, + "kl": 0.051025390625, + "learning_rate": 1.671232211162382e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3393 + }, + { + "completion_length": 2338.0000915527344, + "epoch": 0.5173780487804878, + "grad_norm": 0.10058439280852159, + "kl": 0.0655517578125, + "learning_rate": 1.6704392357438263e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3394 + }, + { + "completion_length": 2700.83349609375, + "epoch": 0.5175304878048781, + "grad_norm": 0.1466106101508432, + "kl": 0.07470703125, + "learning_rate": 1.66964621206646e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3395 + }, + { + "completion_length": 1146.8333435058594, + "epoch": 0.5176829268292683, + "grad_norm": 1.847209854812627, + "kl": 0.115966796875, + "learning_rate": 1.6688531403548222e-06, + "loss": 0.0046, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3396 + }, + { + "completion_length": 3362.0001220703125, + "epoch": 0.5178353658536585, + "grad_norm": 0.05939872724457309, + "kl": 0.057861328125, + "learning_rate": 1.6680600208334673e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3397 + }, + { + "completion_length": 621.6666870117188, + "epoch": 0.5179878048780487, + "grad_norm": 0.225159183215177, + "kl": 0.1173095703125, + "learning_rate": 1.667266853726961e-06, + "loss": 0.0047, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3398 + }, + { + "completion_length": 1482.0000305175781, + "epoch": 0.5181402439024391, + "grad_norm": 0.09100266842353448, + "kl": 0.068359375, + "learning_rate": 1.666473639259884e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3399 + }, + { + "completion_length": 2386.6666870117188, + "epoch": 0.5182926829268293, + "grad_norm": 0.11411437739250806, + "kl": 0.071044921875, + "learning_rate": 1.6656803776568307e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3400 + }, + { + "completion_length": 1677.3333740234375, + "epoch": 0.5184451219512195, + "grad_norm": 0.2343849135528102, + "kl": 0.083984375, + "learning_rate": 1.6648870691424076e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3401 + }, + { + "completion_length": 3421.3333740234375, + "epoch": 0.5185975609756097, + "grad_norm": 0.033495180194194, + "kl": 0.0433349609375, + "learning_rate": 1.6640937139412351e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3402 + }, + { + "completion_length": 1577.166748046875, + "epoch": 0.51875, + "grad_norm": 0.13716337814478097, + "kl": 0.08349609375, + "learning_rate": 1.663300312277947e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3403 + }, + { + "completion_length": 1955.5, + "epoch": 0.5189024390243903, + "grad_norm": 0.08516351735048337, + "kl": 0.0625, + "learning_rate": 1.6625068643771898e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3404 + }, + { + "completion_length": 1694.0, + "epoch": 0.5190548780487805, + "grad_norm": 0.0614993339705026, + "kl": 0.055419921875, + "learning_rate": 1.6617133704636234e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3405 + }, + { + "completion_length": 1052.6666717529297, + "epoch": 0.5192073170731707, + "grad_norm": 0.13477685420771687, + "kl": 0.0675048828125, + "learning_rate": 1.6609198307619212e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3406 + }, + { + "completion_length": 1278.5000610351562, + "epoch": 0.519359756097561, + "grad_norm": 0.1088701776160035, + "kl": 0.083740234375, + "learning_rate": 1.6601262454967683e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3407 + }, + { + "completion_length": 1407.1666870117188, + "epoch": 0.5195121951219512, + "grad_norm": 0.08761751390429551, + "kl": 0.062744140625, + "learning_rate": 1.6593326148928643e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3408 + }, + { + "completion_length": 707.6666870117188, + "epoch": 0.5196646341463415, + "grad_norm": 0.13074746335459733, + "kl": 0.06982421875, + "learning_rate": 1.6585389391749195e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3409 + }, + { + "completion_length": 1964.3333740234375, + "epoch": 0.5198170731707317, + "grad_norm": 0.11688290382859914, + "kl": 0.08642578125, + "learning_rate": 1.657745218567659e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3410 + }, + { + "completion_length": 2085.5001220703125, + "epoch": 0.519969512195122, + "grad_norm": 0.18980820544482535, + "kl": 0.083984375, + "learning_rate": 1.6569514532958199e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3411 + }, + { + "completion_length": 1099.5000305175781, + "epoch": 0.5201219512195122, + "grad_norm": 0.0853996754152971, + "kl": 0.068115234375, + "learning_rate": 1.6561576435841515e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3412 + }, + { + "completion_length": 2585.3334350585938, + "epoch": 0.5202743902439024, + "grad_norm": 0.19382366807027146, + "kl": 0.0797119140625, + "learning_rate": 1.655363789657417e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3413 + }, + { + "completion_length": 2016.0000610351562, + "epoch": 0.5204268292682926, + "grad_norm": 0.06587617118601956, + "kl": 0.0548095703125, + "learning_rate": 1.6545698917403906e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3414 + }, + { + "completion_length": 2209.5001220703125, + "epoch": 0.520579268292683, + "grad_norm": 0.08008990599690503, + "kl": 0.0576171875, + "learning_rate": 1.6537759500578593e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3415 + }, + { + "completion_length": 1741.0000915527344, + "epoch": 0.5207317073170732, + "grad_norm": 0.0979035424103804, + "kl": 0.0589599609375, + "learning_rate": 1.652981964834623e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3416 + }, + { + "completion_length": 2478.666748046875, + "epoch": 0.5208841463414634, + "grad_norm": 1.0795362355078117, + "kl": 0.055908203125, + "learning_rate": 1.6521879362954943e-06, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3417 + }, + { + "completion_length": 1976.5000610351562, + "epoch": 0.5210365853658536, + "grad_norm": 0.331548851782418, + "kl": 0.0672607421875, + "learning_rate": 1.6513938646652965e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3418 + }, + { + "completion_length": 2081.166748046875, + "epoch": 0.521189024390244, + "grad_norm": 0.082318195822822, + "kl": 0.0777587890625, + "learning_rate": 1.650599750168866e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3419 + }, + { + "completion_length": 3284.8333740234375, + "epoch": 0.5213414634146342, + "grad_norm": 0.05644131677051691, + "kl": 0.056396484375, + "learning_rate": 1.6498055930310522e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3420 + }, + { + "completion_length": 1240.6666870117188, + "epoch": 0.5214939024390244, + "grad_norm": 0.13654200888990814, + "kl": 0.072021484375, + "learning_rate": 1.649011393476715e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3421 + }, + { + "completion_length": 1431.8333740234375, + "epoch": 0.5216463414634146, + "grad_norm": 0.24520462947643162, + "kl": 0.0614013671875, + "learning_rate": 1.6482171517307281e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3422 + }, + { + "completion_length": 1520.6666870117188, + "epoch": 0.5217987804878049, + "grad_norm": 0.1084472436771154, + "kl": 0.08447265625, + "learning_rate": 1.647422868017975e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3423 + }, + { + "completion_length": 1847.666748046875, + "epoch": 0.5219512195121951, + "grad_norm": 0.1136779378600279, + "kl": 0.0654296875, + "learning_rate": 1.6466285425633527e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3424 + }, + { + "completion_length": 1873.5, + "epoch": 0.5221036585365854, + "grad_norm": 0.11148943088220414, + "kl": 0.060546875, + "learning_rate": 1.6458341755917696e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3425 + }, + { + "completion_length": 2289.166748046875, + "epoch": 0.5222560975609756, + "grad_norm": 0.06760152237452, + "kl": 0.0517578125, + "learning_rate": 1.645039767328146e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3426 + }, + { + "completion_length": 2443.3333740234375, + "epoch": 0.5224085365853659, + "grad_norm": 0.0728106850326619, + "kl": 0.0528564453125, + "learning_rate": 1.6442453179974125e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3427 + }, + { + "completion_length": 3770.3333740234375, + "epoch": 0.5225609756097561, + "grad_norm": 0.04430847286881083, + "kl": 0.04345703125, + "learning_rate": 1.6434508278245136e-06, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3428 + }, + { + "completion_length": 921.8333435058594, + "epoch": 0.5227134146341463, + "grad_norm": 0.15191566808626739, + "kl": 0.084716796875, + "learning_rate": 1.6426562970344039e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3429 + }, + { + "completion_length": 2956.83349609375, + "epoch": 0.5228658536585366, + "grad_norm": 0.08565516815155741, + "kl": 0.06396484375, + "learning_rate": 1.6418617258520498e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3430 + }, + { + "completion_length": 1665.8333740234375, + "epoch": 0.5230182926829269, + "grad_norm": 0.07697683792615002, + "kl": 0.06298828125, + "learning_rate": 1.6410671145024296e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3431 + }, + { + "completion_length": 932.5, + "epoch": 0.5231707317073171, + "grad_norm": 0.10301959848774571, + "kl": 0.073486328125, + "learning_rate": 1.6402724632105323e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3432 + }, + { + "completion_length": 1586.5000610351562, + "epoch": 0.5233231707317073, + "grad_norm": 0.24358448977669167, + "kl": 0.113037109375, + "learning_rate": 1.6394777722013586e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3433 + }, + { + "completion_length": 3387.166748046875, + "epoch": 0.5234756097560975, + "grad_norm": 0.06427634293818947, + "kl": 0.0440673828125, + "learning_rate": 1.63868304169992e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3434 + }, + { + "completion_length": 539.5000152587891, + "epoch": 0.5236280487804879, + "grad_norm": 2.452259787183975, + "kl": 0.12109375, + "learning_rate": 1.6378882719312396e-06, + "loss": 0.0049, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3435 + }, + { + "completion_length": 1044.5000610351562, + "epoch": 0.5237804878048781, + "grad_norm": 0.15133391289929907, + "kl": 0.0633544921875, + "learning_rate": 1.6370934631203516e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3436 + }, + { + "completion_length": 2314.0001220703125, + "epoch": 0.5239329268292683, + "grad_norm": 0.6775053778347393, + "kl": 0.0849609375, + "learning_rate": 1.6362986154923011e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3437 + }, + { + "completion_length": 901.1666870117188, + "epoch": 0.5240853658536585, + "grad_norm": 0.13490140326620065, + "kl": 0.04248046875, + "learning_rate": 1.6355037292721442e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3438 + }, + { + "completion_length": 1097.1666870117188, + "epoch": 0.5242378048780488, + "grad_norm": 0.08756443623044795, + "kl": 0.0582275390625, + "learning_rate": 1.6347088046849483e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3439 + }, + { + "completion_length": 3494.3333740234375, + "epoch": 0.524390243902439, + "grad_norm": 0.03809420083717607, + "kl": 0.043701171875, + "learning_rate": 1.6339138419557916e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3440 + }, + { + "completion_length": 2995.0, + "epoch": 0.5245426829268293, + "grad_norm": 0.08996822191620751, + "kl": 0.060302734375, + "learning_rate": 1.633118841309762e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3441 + }, + { + "completion_length": 1295.5000305175781, + "epoch": 0.5246951219512195, + "grad_norm": 0.09967624469151305, + "kl": 0.0660400390625, + "learning_rate": 1.63232380297196e-06, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3442 + }, + { + "completion_length": 3968.166748046875, + "epoch": 0.5248475609756098, + "grad_norm": 0.04412071191982435, + "kl": 0.047607421875, + "learning_rate": 1.6315287271674947e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3443 + }, + { + "completion_length": 665.5000152587891, + "epoch": 0.525, + "grad_norm": 0.1656652518273253, + "kl": 0.09814453125, + "learning_rate": 1.6307336141214877e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3444 + }, + { + "completion_length": 1842.666748046875, + "epoch": 0.5251524390243902, + "grad_norm": 0.08794986535155545, + "kl": 0.06494140625, + "learning_rate": 1.6299384640590696e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3445 + }, + { + "completion_length": 1770.166748046875, + "epoch": 0.5253048780487805, + "grad_norm": 0.08164330209202261, + "kl": 0.0501708984375, + "learning_rate": 1.6291432772053828e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3446 + }, + { + "completion_length": 2143.1666870117188, + "epoch": 0.5254573170731708, + "grad_norm": 0.09710526065434288, + "kl": 0.063720703125, + "learning_rate": 1.6283480537855793e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3447 + }, + { + "completion_length": 2847.6666870117188, + "epoch": 0.525609756097561, + "grad_norm": 0.10660427066590163, + "kl": 0.06201171875, + "learning_rate": 1.6275527940248218e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3448 + }, + { + "completion_length": 2384.8333740234375, + "epoch": 0.5257621951219512, + "grad_norm": 0.09444464800177454, + "kl": 0.0594482421875, + "learning_rate": 1.626757498148283e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3449 + }, + { + "completion_length": 2147.6666870117188, + "epoch": 0.5259146341463414, + "grad_norm": 0.08300669826035366, + "kl": 0.057373046875, + "learning_rate": 1.625962166381146e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3450 + }, + { + "completion_length": 3044.6666870117188, + "epoch": 0.5260670731707318, + "grad_norm": 1.8037586924592806, + "kl": 0.0552978515625, + "learning_rate": 1.6251667989486044e-06, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3451 + }, + { + "completion_length": 2644.6666870117188, + "epoch": 0.526219512195122, + "grad_norm": 0.7745541961727912, + "kl": 0.05712890625, + "learning_rate": 1.6243713960758608e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3452 + }, + { + "completion_length": 2069.166748046875, + "epoch": 0.5263719512195122, + "grad_norm": 0.08845205124543441, + "kl": 0.0572509765625, + "learning_rate": 1.6235759579881295e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3453 + }, + { + "completion_length": 2324.666748046875, + "epoch": 0.5265243902439024, + "grad_norm": 0.0929670444583471, + "kl": 0.056640625, + "learning_rate": 1.622780484910633e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3454 + }, + { + "completion_length": 2938.83349609375, + "epoch": 0.5266768292682927, + "grad_norm": 0.08712498867770989, + "kl": 0.0635986328125, + "learning_rate": 1.6219849770686051e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3455 + }, + { + "completion_length": 1342.3334350585938, + "epoch": 0.526829268292683, + "grad_norm": 0.11939700633840418, + "kl": 0.082763671875, + "learning_rate": 1.6211894346872887e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3456 + }, + { + "completion_length": 3157.3333740234375, + "epoch": 0.5269817073170732, + "grad_norm": 0.05675666368696733, + "kl": 0.0474853515625, + "learning_rate": 1.620393857991937e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3457 + }, + { + "completion_length": 4096.0, + "epoch": 0.5271341463414634, + "grad_norm": 0.6011144358093851, + "kl": 0.049560546875, + "learning_rate": 1.6195982472078117e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3458 + }, + { + "completion_length": 1735.3334350585938, + "epoch": 0.5272865853658537, + "grad_norm": 0.12408236364886341, + "kl": 0.076416015625, + "learning_rate": 1.618802602560186e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3459 + }, + { + "completion_length": 3900.0, + "epoch": 0.5274390243902439, + "grad_norm": 0.03363854281968253, + "kl": 0.042236328125, + "learning_rate": 1.6180069242743416e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3460 + }, + { + "completion_length": 3004.3333740234375, + "epoch": 0.5275914634146341, + "grad_norm": 0.056189581022851064, + "kl": 0.06005859375, + "learning_rate": 1.6172112125755694e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3461 + }, + { + "completion_length": 4035.3333740234375, + "epoch": 0.5277439024390244, + "grad_norm": 0.035963788982805285, + "kl": 0.0413818359375, + "learning_rate": 1.616415467689171e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3462 + }, + { + "completion_length": 2709.33349609375, + "epoch": 0.5278963414634147, + "grad_norm": 0.06090345927658237, + "kl": 0.0675048828125, + "learning_rate": 1.6156196898404557e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3463 + }, + { + "completion_length": 3663.5, + "epoch": 0.5280487804878049, + "grad_norm": 0.044865490887386464, + "kl": 0.0419921875, + "learning_rate": 1.614823879254744e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3464 + }, + { + "completion_length": 1780.5000915527344, + "epoch": 0.5282012195121951, + "grad_norm": 0.1264434702035922, + "kl": 0.072021484375, + "learning_rate": 1.6140280361573635e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3465 + }, + { + "completion_length": 3015.5001220703125, + "epoch": 0.5283536585365853, + "grad_norm": 0.08150958865870277, + "kl": 0.0584716796875, + "learning_rate": 1.613232160773653e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3466 + }, + { + "completion_length": 2267.5000610351562, + "epoch": 0.5285060975609757, + "grad_norm": 0.0767193283404, + "kl": 0.0521240234375, + "learning_rate": 1.6124362533289598e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3467 + }, + { + "completion_length": 2495.6666870117188, + "epoch": 0.5286585365853659, + "grad_norm": 0.07647215740801713, + "kl": 0.0572509765625, + "learning_rate": 1.6116403140486397e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3468 + }, + { + "completion_length": 711.3333435058594, + "epoch": 0.5288109756097561, + "grad_norm": 0.15933141433339382, + "kl": 0.09521484375, + "learning_rate": 1.610844343158059e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3469 + }, + { + "completion_length": 3131.0, + "epoch": 0.5289634146341463, + "grad_norm": 0.05424757110892845, + "kl": 0.04931640625, + "learning_rate": 1.6100483408825904e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3470 + }, + { + "completion_length": 1503.0, + "epoch": 0.5291158536585366, + "grad_norm": 0.07844684864962138, + "kl": 0.0748291015625, + "learning_rate": 1.6092523074476183e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3471 + }, + { + "completion_length": 1264.0000610351562, + "epoch": 0.5292682926829269, + "grad_norm": 0.127455242331324, + "kl": 0.1005859375, + "learning_rate": 1.6084562430785336e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3472 + }, + { + "completion_length": 3028.666748046875, + "epoch": 0.5294207317073171, + "grad_norm": 0.041337823860312796, + "kl": 0.0447998046875, + "learning_rate": 1.6076601480007375e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3473 + }, + { + "completion_length": 3159.8333740234375, + "epoch": 0.5295731707317073, + "grad_norm": 0.05582918870603229, + "kl": 0.0518798828125, + "learning_rate": 1.6068640224396393e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3474 + }, + { + "completion_length": 2269.8333740234375, + "epoch": 0.5297256097560976, + "grad_norm": 0.13223629380902777, + "kl": 0.054443359375, + "learning_rate": 1.606067866620657e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3475 + }, + { + "completion_length": 2315.3334350585938, + "epoch": 0.5298780487804878, + "grad_norm": 0.0824087946968139, + "kl": 0.0599365234375, + "learning_rate": 1.605271680769217e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3476 + }, + { + "completion_length": 2754.0000610351562, + "epoch": 0.530030487804878, + "grad_norm": 0.046126599174317114, + "kl": 0.0482177734375, + "learning_rate": 1.604475465110755e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3477 + }, + { + "completion_length": 1572.5000915527344, + "epoch": 0.5301829268292683, + "grad_norm": 0.11039973184008711, + "kl": 0.0615234375, + "learning_rate": 1.603679219870714e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3478 + }, + { + "completion_length": 3392.166748046875, + "epoch": 0.5303353658536586, + "grad_norm": 0.03724363562375838, + "kl": 0.0467529296875, + "learning_rate": 1.6028829452745454e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3479 + }, + { + "completion_length": 747.3333435058594, + "epoch": 0.5304878048780488, + "grad_norm": 0.09686690971388233, + "kl": 0.0560302734375, + "learning_rate": 1.6020866415477108e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3480 + }, + { + "completion_length": 1018.8333740234375, + "epoch": 0.530640243902439, + "grad_norm": 0.08675004417752538, + "kl": 0.0460205078125, + "learning_rate": 1.6012903089156776e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3481 + }, + { + "completion_length": 1593.6666870117188, + "epoch": 0.5307926829268292, + "grad_norm": 0.10634852774497487, + "kl": 0.0692138671875, + "learning_rate": 1.6004939476039226e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3482 + }, + { + "completion_length": 2849.5001220703125, + "epoch": 0.5309451219512196, + "grad_norm": 0.04705694348088077, + "kl": 0.056396484375, + "learning_rate": 1.5996975578379306e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3483 + }, + { + "completion_length": 2368.666748046875, + "epoch": 0.5310975609756098, + "grad_norm": 0.08674738382465123, + "kl": 0.051025390625, + "learning_rate": 1.5989011398431943e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3484 + }, + { + "completion_length": 1199.1666870117188, + "epoch": 0.53125, + "grad_norm": 0.14340634830779866, + "kl": 0.08642578125, + "learning_rate": 1.5981046938452148e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3485 + }, + { + "completion_length": 993.0000305175781, + "epoch": 0.5314024390243902, + "grad_norm": 0.12354042772256368, + "kl": 0.061767578125, + "learning_rate": 1.5973082200695004e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3486 + }, + { + "completion_length": 2547.0001220703125, + "epoch": 0.5315548780487804, + "grad_norm": 0.07690796914612408, + "kl": 0.074462890625, + "learning_rate": 1.5965117187415685e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3487 + }, + { + "completion_length": 2197.0000610351562, + "epoch": 0.5317073170731708, + "grad_norm": 0.1149124802747039, + "kl": 0.0565185546875, + "learning_rate": 1.5957151900869425e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3488 + }, + { + "completion_length": 2768.0001220703125, + "epoch": 0.531859756097561, + "grad_norm": 0.08299153338264834, + "kl": 0.0589599609375, + "learning_rate": 1.5949186343311558e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3489 + }, + { + "completion_length": 2114.5, + "epoch": 0.5320121951219512, + "grad_norm": 0.06996570217465338, + "kl": 0.072998046875, + "learning_rate": 1.5941220516997466e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3490 + }, + { + "completion_length": 2664.166748046875, + "epoch": 0.5321646341463414, + "grad_norm": 0.06611089022137229, + "kl": 0.066162109375, + "learning_rate": 1.593325442418264e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3491 + }, + { + "completion_length": 1662.5, + "epoch": 0.5323170731707317, + "grad_norm": 0.10910783461892373, + "kl": 0.05810546875, + "learning_rate": 1.5925288067122614e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3492 + }, + { + "completion_length": 2035.166748046875, + "epoch": 0.532469512195122, + "grad_norm": 0.13251208959474514, + "kl": 0.06787109375, + "learning_rate": 1.5917321448073019e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3493 + }, + { + "completion_length": 1472.1666870117188, + "epoch": 0.5326219512195122, + "grad_norm": 0.11865880308341561, + "kl": 0.0765380859375, + "learning_rate": 1.590935456928956e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3494 + }, + { + "completion_length": 2955.666748046875, + "epoch": 0.5327743902439024, + "grad_norm": 0.05488727064780662, + "kl": 0.053955078125, + "learning_rate": 1.5901387433028004e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3495 + }, + { + "completion_length": 1640.0, + "epoch": 0.5329268292682927, + "grad_norm": 0.16484664542032412, + "kl": 0.08447265625, + "learning_rate": 1.5893420041544193e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3496 + }, + { + "completion_length": 1988.0000610351562, + "epoch": 0.5330792682926829, + "grad_norm": 0.07061056649557695, + "kl": 0.054443359375, + "learning_rate": 1.5885452397094045e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3497 + }, + { + "completion_length": 1103.0000305175781, + "epoch": 0.5332317073170731, + "grad_norm": 0.11725526333801241, + "kl": 0.068115234375, + "learning_rate": 1.5877484501933557e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3498 + }, + { + "completion_length": 1580.0, + "epoch": 0.5333841463414634, + "grad_norm": 0.12027802589099248, + "kl": 0.0908203125, + "learning_rate": 1.586951635831878e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3499 + }, + { + "completion_length": 1843.0000610351562, + "epoch": 0.5335365853658537, + "grad_norm": 0.08786265492425577, + "kl": 0.06591796875, + "learning_rate": 1.5861547968505853e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3500 + }, + { + "completion_length": 1507.5000610351562, + "epoch": 0.5336890243902439, + "grad_norm": 0.12240687044142619, + "kl": 0.095947265625, + "learning_rate": 1.5853579334750964e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3501 + }, + { + "completion_length": 974.6667175292969, + "epoch": 0.5338414634146341, + "grad_norm": 0.1001181821829008, + "kl": 0.0655517578125, + "learning_rate": 1.5845610459310392e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3502 + }, + { + "completion_length": 1663.666748046875, + "epoch": 0.5339939024390243, + "grad_norm": 0.094465992168966, + "kl": 0.08251953125, + "learning_rate": 1.583764134444047e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3503 + }, + { + "completion_length": 2158.8333740234375, + "epoch": 0.5341463414634147, + "grad_norm": 1.9706442146744716, + "kl": 0.078369140625, + "learning_rate": 1.582967199239761e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3504 + }, + { + "completion_length": 873.0, + "epoch": 0.5342987804878049, + "grad_norm": 2.262241523947767, + "kl": 0.08447265625, + "learning_rate": 1.5821702405438272e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3505 + }, + { + "completion_length": 1689.0001220703125, + "epoch": 0.5344512195121951, + "grad_norm": 0.07939041707329848, + "kl": 0.076904296875, + "learning_rate": 1.5813732585819006e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3506 + }, + { + "completion_length": 1358.3333740234375, + "epoch": 0.5346036585365853, + "grad_norm": 0.10102098738325375, + "kl": 0.0665283203125, + "learning_rate": 1.5805762535796417e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3507 + }, + { + "completion_length": 1136.3333740234375, + "epoch": 0.5347560975609756, + "grad_norm": 0.16281337478969, + "kl": 0.0830078125, + "learning_rate": 1.5797792257627168e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3508 + }, + { + "completion_length": 1291.3333740234375, + "epoch": 0.5349085365853659, + "grad_norm": 0.10280761505097526, + "kl": 0.089111328125, + "learning_rate": 1.5789821753568e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3509 + }, + { + "completion_length": 930.6666870117188, + "epoch": 0.5350609756097561, + "grad_norm": 0.10660703846583686, + "kl": 0.0635986328125, + "learning_rate": 1.5781851025875704e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3510 + }, + { + "completion_length": 690.3333435058594, + "epoch": 0.5352134146341463, + "grad_norm": 0.13549791304274525, + "kl": 0.056640625, + "learning_rate": 1.5773880076807152e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3511 + }, + { + "completion_length": 789.3333435058594, + "epoch": 0.5353658536585366, + "grad_norm": 0.34261385503748354, + "kl": 0.11328125, + "learning_rate": 1.5765908908619258e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3512 + }, + { + "completion_length": 945.5, + "epoch": 0.5355182926829268, + "grad_norm": 0.12726498233642702, + "kl": 0.076904296875, + "learning_rate": 1.5757937523569022e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3513 + }, + { + "completion_length": 648.5, + "epoch": 0.535670731707317, + "grad_norm": 3.182270051212551, + "kl": 0.0787353515625, + "learning_rate": 1.574996592391348e-06, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3514 + }, + { + "completion_length": 721.5000305175781, + "epoch": 0.5358231707317073, + "grad_norm": 0.13044165676507352, + "kl": 0.103515625, + "learning_rate": 1.5741994111909746e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3515 + }, + { + "completion_length": 895.1666870117188, + "epoch": 0.5359756097560976, + "grad_norm": 0.1658484607276027, + "kl": 0.075927734375, + "learning_rate": 1.573402208981499e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3516 + }, + { + "completion_length": 1215.3333435058594, + "epoch": 0.5361280487804878, + "grad_norm": 1.7511169990576285, + "kl": 0.091552734375, + "learning_rate": 1.5726049859886437e-06, + "loss": 0.0037, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3517 + }, + { + "completion_length": 1183.5000305175781, + "epoch": 0.536280487804878, + "grad_norm": 0.11275358645163666, + "kl": 0.077880859375, + "learning_rate": 1.571807742438138e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3518 + }, + { + "completion_length": 1313.8333740234375, + "epoch": 0.5364329268292682, + "grad_norm": 0.1006346864089129, + "kl": 0.0675048828125, + "learning_rate": 1.571010478555716e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3519 + }, + { + "completion_length": 903.5000305175781, + "epoch": 0.5365853658536586, + "grad_norm": 0.08388299418101737, + "kl": 0.0489501953125, + "learning_rate": 1.5702131945671182e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3520 + }, + { + "completion_length": 795.0, + "epoch": 0.5367378048780488, + "grad_norm": 0.1707079377393071, + "kl": 0.10400390625, + "learning_rate": 1.56941589069809e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3521 + }, + { + "completion_length": 1387.6666870117188, + "epoch": 0.536890243902439, + "grad_norm": 1.1478616510058053, + "kl": 0.08251953125, + "learning_rate": 1.5686185671743842e-06, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3522 + }, + { + "completion_length": 751.1666870117188, + "epoch": 0.5370426829268292, + "grad_norm": 0.1451313627019514, + "kl": 0.084228515625, + "learning_rate": 1.567821224221757e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3523 + }, + { + "completion_length": 819.1666870117188, + "epoch": 0.5371951219512195, + "grad_norm": 2.0528535791777505, + "kl": 0.09912109375, + "learning_rate": 1.5670238620659717e-06, + "loss": 0.004, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3524 + }, + { + "completion_length": 1813.5000610351562, + "epoch": 0.5373475609756098, + "grad_norm": 0.0698809541916195, + "kl": 0.060791015625, + "learning_rate": 1.5662264809327964e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3525 + }, + { + "completion_length": 1144.6666870117188, + "epoch": 0.5375, + "grad_norm": 0.19724150272956284, + "kl": 0.082763671875, + "learning_rate": 1.5654290810480041e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3526 + }, + { + "completion_length": 1250.0, + "epoch": 0.5376524390243902, + "grad_norm": 0.1093519300653968, + "kl": 0.077880859375, + "learning_rate": 1.5646316626373742e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3527 + }, + { + "completion_length": 2164.1666870117188, + "epoch": 0.5378048780487805, + "grad_norm": 0.0758496330945465, + "kl": 0.06201171875, + "learning_rate": 1.5638342259266904e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3528 + }, + { + "completion_length": 949.3333435058594, + "epoch": 0.5379573170731707, + "grad_norm": 0.1282677132772872, + "kl": 0.09619140625, + "learning_rate": 1.5630367711417423e-06, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3529 + }, + { + "completion_length": 1502.666748046875, + "epoch": 0.538109756097561, + "grad_norm": 0.15072573136641998, + "kl": 0.070068359375, + "learning_rate": 1.5622392985083234e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3530 + }, + { + "completion_length": 1737.0000610351562, + "epoch": 0.5382621951219512, + "grad_norm": 0.19959182711104914, + "kl": 0.099609375, + "learning_rate": 1.5614418082522346e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3531 + }, + { + "completion_length": 1607.3333740234375, + "epoch": 0.5384146341463415, + "grad_norm": 1.1304663623001778, + "kl": 0.0633544921875, + "learning_rate": 1.5606443005992789e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3532 + }, + { + "completion_length": 1574.0000915527344, + "epoch": 0.5385670731707317, + "grad_norm": 0.12328957859202773, + "kl": 0.081787109375, + "learning_rate": 1.5598467757752662e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3533 + }, + { + "completion_length": 608.3333435058594, + "epoch": 0.5387195121951219, + "grad_norm": 0.17274636471489388, + "kl": 0.13818359375, + "learning_rate": 1.5590492340060113e-06, + "loss": 0.0055, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3534 + }, + { + "completion_length": 2717.5, + "epoch": 0.5388719512195121, + "grad_norm": 1.7268048504813096, + "kl": 0.05908203125, + "learning_rate": 1.5582516755173323e-06, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3535 + }, + { + "completion_length": 1974.666748046875, + "epoch": 0.5390243902439025, + "grad_norm": 0.21564564052393256, + "kl": 0.073486328125, + "learning_rate": 1.5574541005350532e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3536 + }, + { + "completion_length": 1999.0001220703125, + "epoch": 0.5391768292682927, + "grad_norm": 0.0784842559264364, + "kl": 0.070556640625, + "learning_rate": 1.5566565092850024e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3537 + }, + { + "completion_length": 1947.8333740234375, + "epoch": 0.5393292682926829, + "grad_norm": 0.11707992412277095, + "kl": 0.093017578125, + "learning_rate": 1.5558589019930132e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3538 + }, + { + "completion_length": 1677.8334350585938, + "epoch": 0.5394817073170731, + "grad_norm": 0.07479911078341298, + "kl": 0.0604248046875, + "learning_rate": 1.5550612788849223e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3539 + }, + { + "completion_length": 1564.8334350585938, + "epoch": 0.5396341463414634, + "grad_norm": 0.1083451020970895, + "kl": 0.0738525390625, + "learning_rate": 1.5542636401865733e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3540 + }, + { + "completion_length": 1233.3333435058594, + "epoch": 0.5397865853658537, + "grad_norm": 0.2082214370709059, + "kl": 0.078857421875, + "learning_rate": 1.553465986123811e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3541 + }, + { + "completion_length": 2185.0, + "epoch": 0.5399390243902439, + "grad_norm": 0.10760200928703907, + "kl": 0.072509765625, + "learning_rate": 1.5526683169224873e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3542 + }, + { + "completion_length": 1872.166748046875, + "epoch": 0.5400914634146341, + "grad_norm": 1.4037443711965596, + "kl": 0.087890625, + "learning_rate": 1.5518706328084564e-06, + "loss": 0.0035, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3543 + }, + { + "completion_length": 1441.0, + "epoch": 0.5402439024390244, + "grad_norm": 0.14756428256026033, + "kl": 0.095947265625, + "learning_rate": 1.5510729340075781e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3544 + }, + { + "completion_length": 1025.3333740234375, + "epoch": 0.5403963414634146, + "grad_norm": 2.1134659404934015, + "kl": 0.106201171875, + "learning_rate": 1.5502752207457163e-06, + "loss": 0.0042, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3545 + }, + { + "completion_length": 2185.0000610351562, + "epoch": 0.5405487804878049, + "grad_norm": 0.09365063576032981, + "kl": 0.090576171875, + "learning_rate": 1.5494774932487375e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3546 + }, + { + "completion_length": 1392.666748046875, + "epoch": 0.5407012195121951, + "grad_norm": 0.10334192160771664, + "kl": 0.10107421875, + "learning_rate": 1.5486797517425144e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3547 + }, + { + "completion_length": 1545.0000610351562, + "epoch": 0.5408536585365854, + "grad_norm": 0.290042243642026, + "kl": 0.0626220703125, + "learning_rate": 1.5478819964529216e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3548 + }, + { + "completion_length": 949.5000610351562, + "epoch": 0.5410060975609756, + "grad_norm": 0.07454625528214864, + "kl": 0.045166015625, + "learning_rate": 1.5470842276058389e-06, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3549 + }, + { + "completion_length": 1612.5, + "epoch": 0.5411585365853658, + "grad_norm": 0.10351879861829165, + "kl": 0.0849609375, + "learning_rate": 1.5462864454271498e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3550 + }, + { + "completion_length": 1076.6666870117188, + "epoch": 0.541310975609756, + "grad_norm": 0.12103274295039511, + "kl": 0.09716796875, + "learning_rate": 1.5454886501427413e-06, + "loss": 0.0039, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3551 + }, + { + "completion_length": 2039.6666870117188, + "epoch": 0.5414634146341464, + "grad_norm": 0.08431700641635768, + "kl": 0.078125, + "learning_rate": 1.544690841978504e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3552 + }, + { + "completion_length": 1116.0000305175781, + "epoch": 0.5416158536585366, + "grad_norm": 0.0975569206775406, + "kl": 0.048828125, + "learning_rate": 1.5438930211603325e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3553 + }, + { + "completion_length": 2283.3333740234375, + "epoch": 0.5417682926829268, + "grad_norm": 0.06549556315614079, + "kl": 0.0733642578125, + "learning_rate": 1.5430951879141248e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3554 + }, + { + "completion_length": 2408.1666870117188, + "epoch": 0.541920731707317, + "grad_norm": 0.08786353982865563, + "kl": 0.073974609375, + "learning_rate": 1.542297342465782e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3555 + }, + { + "completion_length": 1586.666748046875, + "epoch": 0.5420731707317074, + "grad_norm": 2.0126999546355586, + "kl": 0.0927734375, + "learning_rate": 1.5414994850412102e-06, + "loss": 0.0037, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3556 + }, + { + "completion_length": 2118.3334350585938, + "epoch": 0.5422256097560976, + "grad_norm": 0.07514250881111348, + "kl": 0.082275390625, + "learning_rate": 1.5407016158663162e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3557 + }, + { + "completion_length": 2486.5001220703125, + "epoch": 0.5423780487804878, + "grad_norm": 1.449711917716851, + "kl": 0.087158203125, + "learning_rate": 1.5399037351670126e-06, + "loss": 0.0035, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3558 + }, + { + "completion_length": 1980.3333435058594, + "epoch": 0.542530487804878, + "grad_norm": 0.08078094552270362, + "kl": 0.0570068359375, + "learning_rate": 1.5391058431692144e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3559 + }, + { + "completion_length": 3444.83349609375, + "epoch": 0.5426829268292683, + "grad_norm": 0.7449211560590127, + "kl": 0.067138671875, + "learning_rate": 1.5383079400988402e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3560 + }, + { + "completion_length": 1185.166748046875, + "epoch": 0.5428353658536585, + "grad_norm": 0.09178375534982379, + "kl": 0.0615234375, + "learning_rate": 1.53751002618181e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3561 + }, + { + "completion_length": 1300.3333740234375, + "epoch": 0.5429878048780488, + "grad_norm": 0.11895277588043535, + "kl": 0.085693359375, + "learning_rate": 1.5367121016440491e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3562 + }, + { + "completion_length": 2350.166748046875, + "epoch": 0.543140243902439, + "grad_norm": 0.06995235527407505, + "kl": 0.0606689453125, + "learning_rate": 1.535914166711485e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3563 + }, + { + "completion_length": 1431.8333740234375, + "epoch": 0.5432926829268293, + "grad_norm": 1.0557446213443735, + "kl": 0.1044921875, + "learning_rate": 1.5351162216100473e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3564 + }, + { + "completion_length": 838.0, + "epoch": 0.5434451219512195, + "grad_norm": 0.1264790346616455, + "kl": 0.08447265625, + "learning_rate": 1.5343182665656703e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3565 + }, + { + "completion_length": 1459.666748046875, + "epoch": 0.5435975609756097, + "grad_norm": 0.10365007842097806, + "kl": 0.086181640625, + "learning_rate": 1.5335203018042888e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3566 + }, + { + "completion_length": 2423.8333435058594, + "epoch": 0.54375, + "grad_norm": 0.11292978667027839, + "kl": 0.066162109375, + "learning_rate": 1.532722327551842e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3567 + }, + { + "completion_length": 835.6666870117188, + "epoch": 0.5439024390243903, + "grad_norm": 0.12625864583301477, + "kl": 0.0648193359375, + "learning_rate": 1.5319243440342713e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3568 + }, + { + "completion_length": 2768.666748046875, + "epoch": 0.5440548780487805, + "grad_norm": 0.800723702809938, + "kl": 0.07421875, + "learning_rate": 1.5311263514775214e-06, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3569 + }, + { + "completion_length": 2316.5000915527344, + "epoch": 0.5442073170731707, + "grad_norm": 0.09159403840795795, + "kl": 0.075927734375, + "learning_rate": 1.530328350107538e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3570 + }, + { + "completion_length": 2547.5000610351562, + "epoch": 0.5443597560975609, + "grad_norm": 0.06407239517087543, + "kl": 0.0574951171875, + "learning_rate": 1.5295303401502705e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3571 + }, + { + "completion_length": 2539.5, + "epoch": 0.5445121951219513, + "grad_norm": 0.08725170423250395, + "kl": 0.08740234375, + "learning_rate": 1.5287323218316713e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3572 + }, + { + "completion_length": 1177.3333740234375, + "epoch": 0.5446646341463415, + "grad_norm": 0.17383221361764203, + "kl": 0.11474609375, + "learning_rate": 1.5279342953776933e-06, + "loss": 0.0046, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3573 + }, + { + "completion_length": 984.5000305175781, + "epoch": 0.5448170731707317, + "grad_norm": 1.3309981210342001, + "kl": 0.095947265625, + "learning_rate": 1.5271362610142934e-06, + "loss": 0.0038, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 3574 + }, + { + "completion_length": 1167.3333740234375, + "epoch": 0.5449695121951219, + "grad_norm": 0.12784426103061133, + "kl": 0.10791015625, + "learning_rate": 1.5263382189674294e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3575 + }, + { + "completion_length": 1911.6666870117188, + "epoch": 0.5451219512195122, + "grad_norm": 0.12260947635225951, + "kl": 0.093505859375, + "learning_rate": 1.5255401694630625e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3576 + }, + { + "completion_length": 1764.8333740234375, + "epoch": 0.5452743902439025, + "grad_norm": 0.09601130904585702, + "kl": 0.083740234375, + "learning_rate": 1.5247421127271548e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3577 + }, + { + "completion_length": 1395.5000610351562, + "epoch": 0.5454268292682927, + "grad_norm": 1.0924277355842062, + "kl": 0.0830078125, + "learning_rate": 1.5239440489856724e-06, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3578 + }, + { + "completion_length": 1363.6666870117188, + "epoch": 0.5455792682926829, + "grad_norm": 1.3041648528312848, + "kl": 0.09375, + "learning_rate": 1.5231459784645808e-06, + "loss": 0.0037, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3579 + }, + { + "completion_length": 1918.5000610351562, + "epoch": 0.5457317073170732, + "grad_norm": 1.038985785157561, + "kl": 0.124267578125, + "learning_rate": 1.5223479013898489e-06, + "loss": 0.005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3580 + }, + { + "completion_length": 1139.6666870117188, + "epoch": 0.5458841463414634, + "grad_norm": 0.11997571115131689, + "kl": 0.1005859375, + "learning_rate": 1.5215498179874483e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3581 + }, + { + "completion_length": 2270.3334350585938, + "epoch": 0.5460365853658536, + "grad_norm": 0.10224941680880155, + "kl": 0.08544921875, + "learning_rate": 1.5207517284833502e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3582 + }, + { + "completion_length": 728.0000305175781, + "epoch": 0.5461890243902439, + "grad_norm": 0.1691855451633089, + "kl": 0.083984375, + "learning_rate": 1.5199536331035291e-06, + "loss": 0.0034, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3583 + }, + { + "completion_length": 1620.3333740234375, + "epoch": 0.5463414634146342, + "grad_norm": 0.0803939799721087, + "kl": 0.072265625, + "learning_rate": 1.5191555320739608e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3584 + }, + { + "completion_length": 1980.6666870117188, + "epoch": 0.5464939024390244, + "grad_norm": 0.10089648307925271, + "kl": 0.1083984375, + "learning_rate": 1.5183574256206225e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3585 + }, + { + "completion_length": 2377.166717529297, + "epoch": 0.5466463414634146, + "grad_norm": 0.09978426584625724, + "kl": 0.079345703125, + "learning_rate": 1.5175593139694931e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3586 + }, + { + "completion_length": 2859.5, + "epoch": 0.5467987804878048, + "grad_norm": 1.5545373018659148, + "kl": 0.0859375, + "learning_rate": 1.5167611973465536e-06, + "loss": 0.0035, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3587 + }, + { + "completion_length": 2766.0001220703125, + "epoch": 0.5469512195121952, + "grad_norm": 0.12204953029119962, + "kl": 0.08837890625, + "learning_rate": 1.5159630759777845e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3588 + }, + { + "completion_length": 778.0, + "epoch": 0.5471036585365854, + "grad_norm": 0.1471455143092437, + "kl": 0.094482421875, + "learning_rate": 1.51516495008917e-06, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3589 + }, + { + "completion_length": 1636.8333740234375, + "epoch": 0.5472560975609756, + "grad_norm": 1.086161847028351, + "kl": 0.0869140625, + "learning_rate": 1.5143668199066938e-06, + "loss": 0.0035, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3590 + }, + { + "completion_length": 2586.166748046875, + "epoch": 0.5474085365853658, + "grad_norm": 1.5367148964881896, + "kl": 0.1015625, + "learning_rate": 1.5135686856563421e-06, + "loss": 0.0041, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3591 + }, + { + "completion_length": 626.5, + "epoch": 0.5475609756097561, + "grad_norm": 0.14935655814479046, + "kl": 0.081298828125, + "learning_rate": 1.5127705475641014e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3592 + }, + { + "completion_length": 2204.5000915527344, + "epoch": 0.5477134146341464, + "grad_norm": 0.3241250392419374, + "kl": 0.069580078125, + "learning_rate": 1.511972405855959e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3593 + }, + { + "completion_length": 2192.0000610351562, + "epoch": 0.5478658536585366, + "grad_norm": 0.07879097396058198, + "kl": 0.0626220703125, + "learning_rate": 1.5111742607579048e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3594 + }, + { + "completion_length": 857.3333435058594, + "epoch": 0.5480182926829268, + "grad_norm": 0.2106065763160806, + "kl": 0.08447265625, + "learning_rate": 1.5103761124959273e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3595 + }, + { + "completion_length": 2719.8333740234375, + "epoch": 0.5481707317073171, + "grad_norm": 0.08248420188626013, + "kl": 0.068603515625, + "learning_rate": 1.5095779612960189e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3596 + }, + { + "completion_length": 1262.3333740234375, + "epoch": 0.5483231707317073, + "grad_norm": 0.12236897841183916, + "kl": 0.074951171875, + "learning_rate": 1.50877980738417e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3597 + }, + { + "completion_length": 1429.5, + "epoch": 0.5484756097560975, + "grad_norm": 1.3497238729458845, + "kl": 0.090087890625, + "learning_rate": 1.5079816509863734e-06, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3598 + }, + { + "completion_length": 1288.3333740234375, + "epoch": 0.5486280487804878, + "grad_norm": 0.14362949718370494, + "kl": 0.143310546875, + "learning_rate": 1.5071834923286215e-06, + "loss": 0.0057, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3599 + }, + { + "completion_length": 1382.1666870117188, + "epoch": 0.5487804878048781, + "grad_norm": 0.24270418173339156, + "kl": 0.13037109375, + "learning_rate": 1.5063853316369081e-06, + "loss": 0.0052, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3600 + }, + { + "completion_length": 1644.0000610351562, + "epoch": 0.5489329268292683, + "grad_norm": 0.1354151800732545, + "kl": 0.08740234375, + "learning_rate": 1.5055871691372282e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3601 + }, + { + "completion_length": 830.3333740234375, + "epoch": 0.5490853658536585, + "grad_norm": 0.18530250952785035, + "kl": 0.10888671875, + "learning_rate": 1.504789005055576e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3602 + }, + { + "completion_length": 1160.6666870117188, + "epoch": 0.5492378048780487, + "grad_norm": 0.11154001493316017, + "kl": 0.07470703125, + "learning_rate": 1.503990839617947e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3603 + }, + { + "completion_length": 2009.8333435058594, + "epoch": 0.5493902439024391, + "grad_norm": 0.15498226927674824, + "kl": 0.0869140625, + "learning_rate": 1.5031926730503356e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3604 + }, + { + "completion_length": 2337.166748046875, + "epoch": 0.5495426829268293, + "grad_norm": 0.07598330609683566, + "kl": 0.0567626953125, + "learning_rate": 1.5023945055787398e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3605 + }, + { + "completion_length": 1968.3334350585938, + "epoch": 0.5496951219512195, + "grad_norm": 0.08466495313471364, + "kl": 0.061767578125, + "learning_rate": 1.501596337429154e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3606 + }, + { + "completion_length": 1315.5000610351562, + "epoch": 0.5498475609756097, + "grad_norm": 0.12338244282986063, + "kl": 0.102294921875, + "learning_rate": 1.5007981688275756e-06, + "loss": 0.0041, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3607 + }, + { + "completion_length": 766.8333435058594, + "epoch": 0.55, + "grad_norm": 0.21968073658285833, + "kl": 0.118408203125, + "learning_rate": 1.5e-06, + "loss": 0.0047, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3608 + }, + { + "completion_length": 1115.6666870117188, + "epoch": 0.5501524390243903, + "grad_norm": 0.21753684364942552, + "kl": 0.13623046875, + "learning_rate": 1.4992018311724247e-06, + "loss": 0.0054, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3609 + }, + { + "completion_length": 1391.1666870117188, + "epoch": 0.5503048780487805, + "grad_norm": 0.09751976127226102, + "kl": 0.088623046875, + "learning_rate": 1.4984036625708463e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3610 + }, + { + "completion_length": 1074.6666870117188, + "epoch": 0.5504573170731707, + "grad_norm": 0.17196739372332362, + "kl": 0.09912109375, + "learning_rate": 1.4976054944212607e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3611 + }, + { + "completion_length": 1762.5000610351562, + "epoch": 0.550609756097561, + "grad_norm": 0.09355278977779803, + "kl": 0.0679931640625, + "learning_rate": 1.4968073269496644e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3612 + }, + { + "completion_length": 1861.3334350585938, + "epoch": 0.5507621951219512, + "grad_norm": 0.10170714614654756, + "kl": 0.087158203125, + "learning_rate": 1.4960091603820537e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3613 + }, + { + "completion_length": 873.0000305175781, + "epoch": 0.5509146341463415, + "grad_norm": 2.892800000264371, + "kl": 0.115234375, + "learning_rate": 1.4952109949444245e-06, + "loss": 0.0046, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3614 + }, + { + "completion_length": 940.3333435058594, + "epoch": 0.5510670731707317, + "grad_norm": 18.452211694029494, + "kl": 0.165283203125, + "learning_rate": 1.4944128308627723e-06, + "loss": 0.0066, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3615 + }, + { + "completion_length": 596.3333435058594, + "epoch": 0.551219512195122, + "grad_norm": 0.22188800847626816, + "kl": 0.089599609375, + "learning_rate": 1.4936146683630921e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3616 + }, + { + "completion_length": 2301.166748046875, + "epoch": 0.5513719512195122, + "grad_norm": 0.1026973848239066, + "kl": 0.0687255859375, + "learning_rate": 1.4928165076713793e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3617 + }, + { + "completion_length": 1482.8333740234375, + "epoch": 0.5515243902439024, + "grad_norm": 0.1349955285403739, + "kl": 0.09716796875, + "learning_rate": 1.492018349013627e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3618 + }, + { + "completion_length": 1992.666748046875, + "epoch": 0.5516768292682926, + "grad_norm": 0.08465481866304884, + "kl": 0.075439453125, + "learning_rate": 1.49122019261583e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3619 + }, + { + "completion_length": 1630.0000915527344, + "epoch": 0.551829268292683, + "grad_norm": 1.2462399170182905, + "kl": 0.1025390625, + "learning_rate": 1.4904220387039814e-06, + "loss": 0.0041, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3620 + }, + { + "completion_length": 1945.666748046875, + "epoch": 0.5519817073170732, + "grad_norm": 0.0768701973924077, + "kl": 0.077880859375, + "learning_rate": 1.4896238875040725e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3621 + }, + { + "completion_length": 3536.5, + "epoch": 0.5521341463414634, + "grad_norm": 0.04069147153808612, + "kl": 0.0592041015625, + "learning_rate": 1.4888257392420953e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3622 + }, + { + "completion_length": 1889.666748046875, + "epoch": 0.5522865853658536, + "grad_norm": 0.0917157050422142, + "kl": 0.079345703125, + "learning_rate": 1.488027594144041e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3623 + }, + { + "completion_length": 2245.666748046875, + "epoch": 0.552439024390244, + "grad_norm": 0.09209774085689405, + "kl": 0.0654296875, + "learning_rate": 1.4872294524358989e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3624 + }, + { + "completion_length": 2196.666748046875, + "epoch": 0.5525914634146342, + "grad_norm": 0.8355881418446383, + "kl": 0.0765380859375, + "learning_rate": 1.4864313143436584e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3625 + }, + { + "completion_length": 949.6667175292969, + "epoch": 0.5527439024390244, + "grad_norm": 0.1435865587635797, + "kl": 0.078857421875, + "learning_rate": 1.4856331800933063e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3626 + }, + { + "completion_length": 783.6666870117188, + "epoch": 0.5528963414634146, + "grad_norm": 0.3920527570296634, + "kl": 0.088134765625, + "learning_rate": 1.4848350499108301e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3627 + }, + { + "completion_length": 2193.3334350585938, + "epoch": 0.5530487804878049, + "grad_norm": 0.0973369585990749, + "kl": 0.0733642578125, + "learning_rate": 1.4840369240222158e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3628 + }, + { + "completion_length": 3114.166748046875, + "epoch": 0.5532012195121951, + "grad_norm": 0.05003434415109831, + "kl": 0.052734375, + "learning_rate": 1.4832388026534467e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3629 + }, + { + "completion_length": 2308.5, + "epoch": 0.5533536585365854, + "grad_norm": 0.05956239748803489, + "kl": 0.05419921875, + "learning_rate": 1.4824406860305071e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3630 + }, + { + "completion_length": 1759.3333435058594, + "epoch": 0.5535060975609756, + "grad_norm": 0.24398473656853126, + "kl": 0.11865234375, + "learning_rate": 1.4816425743793776e-06, + "loss": 0.0047, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3631 + }, + { + "completion_length": 1895.3333435058594, + "epoch": 0.5536585365853659, + "grad_norm": 0.09967686317284156, + "kl": 0.06201171875, + "learning_rate": 1.4808444679260396e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3632 + }, + { + "completion_length": 3084.8333740234375, + "epoch": 0.5538109756097561, + "grad_norm": 0.04994422120159832, + "kl": 0.054931640625, + "learning_rate": 1.4800463668964712e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3633 + }, + { + "completion_length": 2195.6666870117188, + "epoch": 0.5539634146341463, + "grad_norm": 0.11628050551548054, + "kl": 0.07275390625, + "learning_rate": 1.4792482715166501e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3634 + }, + { + "completion_length": 3700.0, + "epoch": 0.5541158536585366, + "grad_norm": 0.03997146362419836, + "kl": 0.0499267578125, + "learning_rate": 1.4784501820125522e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3635 + }, + { + "completion_length": 1132.8333740234375, + "epoch": 0.5542682926829269, + "grad_norm": 0.11773566883440219, + "kl": 0.077880859375, + "learning_rate": 1.4776520986101508e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3636 + }, + { + "completion_length": 2441.6666870117188, + "epoch": 0.5544207317073171, + "grad_norm": 0.06227357923329126, + "kl": 0.059814453125, + "learning_rate": 1.4768540215354192e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3637 + }, + { + "completion_length": 1949.1666870117188, + "epoch": 0.5545731707317073, + "grad_norm": 0.10455584994616691, + "kl": 0.0703125, + "learning_rate": 1.476055951014328e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3638 + }, + { + "completion_length": 1602.5, + "epoch": 0.5547256097560975, + "grad_norm": 0.12585131230673888, + "kl": 0.09814453125, + "learning_rate": 1.4752578872728449e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3639 + }, + { + "completion_length": 1736.0, + "epoch": 0.5548780487804879, + "grad_norm": 0.09036251517540926, + "kl": 0.0850830078125, + "learning_rate": 1.4744598305369376e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3640 + }, + { + "completion_length": 2408.166748046875, + "epoch": 0.5550304878048781, + "grad_norm": 0.05824208688629202, + "kl": 0.054443359375, + "learning_rate": 1.4736617810325709e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3641 + }, + { + "completion_length": 3248.3333740234375, + "epoch": 0.5551829268292683, + "grad_norm": 0.10860153874776199, + "kl": 0.0738525390625, + "learning_rate": 1.4728637389857067e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3642 + }, + { + "completion_length": 1873.0001220703125, + "epoch": 0.5553353658536585, + "grad_norm": 0.08246416214786324, + "kl": 0.081298828125, + "learning_rate": 1.4720657046223068e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3643 + }, + { + "completion_length": 2682.8333740234375, + "epoch": 0.5554878048780488, + "grad_norm": 0.07473785893793529, + "kl": 0.0592041015625, + "learning_rate": 1.4712676781683288e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3644 + }, + { + "completion_length": 3174.5001220703125, + "epoch": 0.555640243902439, + "grad_norm": 0.05057148752792326, + "kl": 0.0506591796875, + "learning_rate": 1.4704696598497296e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3645 + }, + { + "completion_length": 1905.3333740234375, + "epoch": 0.5557926829268293, + "grad_norm": 0.1352894012433721, + "kl": 0.080078125, + "learning_rate": 1.4696716498924623e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3646 + }, + { + "completion_length": 2539.3333740234375, + "epoch": 0.5559451219512195, + "grad_norm": 1.273979050492806, + "kl": 0.056884765625, + "learning_rate": 1.468873648522479e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3647 + }, + { + "completion_length": 2803.6666870117188, + "epoch": 0.5560975609756098, + "grad_norm": 0.31228043889217866, + "kl": 0.081298828125, + "learning_rate": 1.4680756559657292e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3648 + }, + { + "completion_length": 2351.666748046875, + "epoch": 0.55625, + "grad_norm": 0.09870894017792808, + "kl": 0.0673828125, + "learning_rate": 1.4672776724481584e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3649 + }, + { + "completion_length": 1705.166748046875, + "epoch": 0.5564024390243902, + "grad_norm": 0.09510433518006126, + "kl": 0.083984375, + "learning_rate": 1.466479698195712e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3650 + }, + { + "completion_length": 817.0000305175781, + "epoch": 0.5565548780487805, + "grad_norm": 0.13165957541211104, + "kl": 0.061767578125, + "learning_rate": 1.4656817334343305e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3651 + }, + { + "completion_length": 1302.6666870117188, + "epoch": 0.5567073170731708, + "grad_norm": 0.148863692283624, + "kl": 0.110107421875, + "learning_rate": 1.464883778389953e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3652 + }, + { + "completion_length": 586.8333435058594, + "epoch": 0.556859756097561, + "grad_norm": 2.1401893195758017, + "kl": 0.102783203125, + "learning_rate": 1.4640858332885154e-06, + "loss": 0.0041, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3653 + }, + { + "completion_length": 1839.8334350585938, + "epoch": 0.5570121951219512, + "grad_norm": 0.12414701422680946, + "kl": 0.080322265625, + "learning_rate": 1.4632878983559512e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3654 + }, + { + "completion_length": 864.0000610351562, + "epoch": 0.5571646341463414, + "grad_norm": 0.12604005646726138, + "kl": 0.080078125, + "learning_rate": 1.46248997381819e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3655 + }, + { + "completion_length": 1272.6666870117188, + "epoch": 0.5573170731707318, + "grad_norm": 0.23054953162504452, + "kl": 0.1053466796875, + "learning_rate": 1.4616920599011603e-06, + "loss": 0.0042, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3656 + }, + { + "completion_length": 1683.5000915527344, + "epoch": 0.557469512195122, + "grad_norm": 3.102296121410529, + "kl": 0.115234375, + "learning_rate": 1.4608941568307854e-06, + "loss": 0.0046, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3657 + }, + { + "completion_length": 1034.0000610351562, + "epoch": 0.5576219512195122, + "grad_norm": 0.14684075889041306, + "kl": 0.0732421875, + "learning_rate": 1.4600962648329872e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3658 + }, + { + "completion_length": 1721.1666870117188, + "epoch": 0.5577743902439024, + "grad_norm": 0.188453469943027, + "kl": 0.0751953125, + "learning_rate": 1.459298384133684e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3659 + }, + { + "completion_length": 735.1666870117188, + "epoch": 0.5579268292682927, + "grad_norm": 0.14475008414090296, + "kl": 0.076171875, + "learning_rate": 1.4585005149587903e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3660 + }, + { + "completion_length": 887.1666870117188, + "epoch": 0.558079268292683, + "grad_norm": 0.34529253869948867, + "kl": 0.08740234375, + "learning_rate": 1.4577026575342182e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3661 + }, + { + "completion_length": 1741.5, + "epoch": 0.5582317073170732, + "grad_norm": 0.10590574619758177, + "kl": 0.0792236328125, + "learning_rate": 1.4569048120858757e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3662 + }, + { + "completion_length": 1545.8333740234375, + "epoch": 0.5583841463414634, + "grad_norm": 0.15979958832026608, + "kl": 0.103759765625, + "learning_rate": 1.4561069788396678e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3663 + }, + { + "completion_length": 1154.3333740234375, + "epoch": 0.5585365853658537, + "grad_norm": 0.10639374129231026, + "kl": 0.0616455078125, + "learning_rate": 1.4553091580214963e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3664 + }, + { + "completion_length": 549.3333740234375, + "epoch": 0.5586890243902439, + "grad_norm": 0.21535437078106345, + "kl": 0.1041259765625, + "learning_rate": 1.454511349857259e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3665 + }, + { + "completion_length": 656.0, + "epoch": 0.5588414634146341, + "grad_norm": 0.286487788745412, + "kl": 0.20703125, + "learning_rate": 1.4537135545728507e-06, + "loss": 0.0083, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3666 + }, + { + "completion_length": 2047.166748046875, + "epoch": 0.5589939024390244, + "grad_norm": 0.14037548434961858, + "kl": 0.080810546875, + "learning_rate": 1.4529157723941612e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3667 + }, + { + "completion_length": 2580.166748046875, + "epoch": 0.5591463414634147, + "grad_norm": 0.0731706191698198, + "kl": 0.05859375, + "learning_rate": 1.452118003547079e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3668 + }, + { + "completion_length": 2588.1666870117188, + "epoch": 0.5592987804878049, + "grad_norm": 0.08524200653185462, + "kl": 0.076171875, + "learning_rate": 1.4513202482574863e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3669 + }, + { + "completion_length": 2195.666748046875, + "epoch": 0.5594512195121951, + "grad_norm": 0.06501880428400902, + "kl": 0.07080078125, + "learning_rate": 1.450522506751263e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3670 + }, + { + "completion_length": 1373.5, + "epoch": 0.5596036585365853, + "grad_norm": 0.11066455606559665, + "kl": 0.079833984375, + "learning_rate": 1.4497247792542844e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3671 + }, + { + "completion_length": 1231.8333435058594, + "epoch": 0.5597560975609757, + "grad_norm": 2.637939627849036, + "kl": 0.093505859375, + "learning_rate": 1.4489270659924222e-06, + "loss": 0.0037, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3672 + }, + { + "completion_length": 1703.5000610351562, + "epoch": 0.5599085365853659, + "grad_norm": 0.10980035908129229, + "kl": 0.086669921875, + "learning_rate": 1.4481293671915434e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3673 + }, + { + "completion_length": 1886.0000610351562, + "epoch": 0.5600609756097561, + "grad_norm": 0.1859678202632946, + "kl": 0.09033203125, + "learning_rate": 1.447331683077513e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3674 + }, + { + "completion_length": 1964.3333740234375, + "epoch": 0.5602134146341463, + "grad_norm": 0.11328652443331787, + "kl": 0.064208984375, + "learning_rate": 1.446534013876189e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3675 + }, + { + "completion_length": 1571.0000610351562, + "epoch": 0.5603658536585366, + "grad_norm": 0.10426070092137588, + "kl": 0.093017578125, + "learning_rate": 1.4457363598134272e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3676 + }, + { + "completion_length": 2055.3333740234375, + "epoch": 0.5605182926829269, + "grad_norm": 0.09281218483912701, + "kl": 0.079345703125, + "learning_rate": 1.4449387211150774e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3677 + }, + { + "completion_length": 1116.6666870117188, + "epoch": 0.5606707317073171, + "grad_norm": 0.09700730475037866, + "kl": 0.081298828125, + "learning_rate": 1.444141098006987e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3678 + }, + { + "completion_length": 1473.666748046875, + "epoch": 0.5608231707317073, + "grad_norm": 1.492397936198045, + "kl": 0.079345703125, + "learning_rate": 1.4433434907149977e-06, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3679 + }, + { + "completion_length": 895.3333740234375, + "epoch": 0.5609756097560976, + "grad_norm": 0.10312383975330963, + "kl": 0.0615234375, + "learning_rate": 1.442545899464947e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3680 + }, + { + "completion_length": 1213.6666870117188, + "epoch": 0.5611280487804878, + "grad_norm": 0.1639278340689616, + "kl": 0.08544921875, + "learning_rate": 1.441748324482668e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3681 + }, + { + "completion_length": 913.5000610351562, + "epoch": 0.561280487804878, + "grad_norm": 0.16494274943933165, + "kl": 0.0767822265625, + "learning_rate": 1.440950765993989e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3682 + }, + { + "completion_length": 1405.0, + "epoch": 0.5614329268292683, + "grad_norm": 2.464349629388223, + "kl": 0.091796875, + "learning_rate": 1.440153224224734e-06, + "loss": 0.0037, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3683 + }, + { + "completion_length": 2213.5000610351562, + "epoch": 0.5615853658536586, + "grad_norm": 0.21212020304872828, + "kl": 0.10400390625, + "learning_rate": 1.4393556994007214e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3684 + }, + { + "completion_length": 960.6666870117188, + "epoch": 0.5617378048780488, + "grad_norm": 0.10708529034673986, + "kl": 0.0811767578125, + "learning_rate": 1.4385581917477657e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3685 + }, + { + "completion_length": 1152.3333740234375, + "epoch": 0.561890243902439, + "grad_norm": 0.16627868876469784, + "kl": 0.080810546875, + "learning_rate": 1.4377607014916769e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3686 + }, + { + "completion_length": 1063.6666870117188, + "epoch": 0.5620426829268292, + "grad_norm": 0.11716947485695536, + "kl": 0.062255859375, + "learning_rate": 1.4369632288582582e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3687 + }, + { + "completion_length": 1469.6667175292969, + "epoch": 0.5621951219512196, + "grad_norm": 1.7701070609056635, + "kl": 0.09619140625, + "learning_rate": 1.4361657740733103e-06, + "loss": 0.0038, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 3688 + }, + { + "completion_length": 1010.0, + "epoch": 0.5623475609756098, + "grad_norm": 0.11301862387185782, + "kl": 0.071044921875, + "learning_rate": 1.4353683373626263e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3689 + }, + { + "completion_length": 1170.8333740234375, + "epoch": 0.5625, + "grad_norm": 17.48064311110984, + "kl": 0.176025390625, + "learning_rate": 1.4345709189519962e-06, + "loss": 0.0071, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3690 + }, + { + "completion_length": 1039.8333740234375, + "epoch": 0.5626524390243902, + "grad_norm": 1.775065732203127, + "kl": 0.1259765625, + "learning_rate": 1.4337735190672043e-06, + "loss": 0.0051, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3691 + }, + { + "completion_length": 2267.166748046875, + "epoch": 0.5628048780487804, + "grad_norm": 0.09156539773029758, + "kl": 0.073974609375, + "learning_rate": 1.4329761379340283e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3692 + }, + { + "completion_length": 3440.666748046875, + "epoch": 0.5629573170731708, + "grad_norm": 0.09233805398082753, + "kl": 0.06689453125, + "learning_rate": 1.4321787757782427e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3693 + }, + { + "completion_length": 1230.3333740234375, + "epoch": 0.563109756097561, + "grad_norm": 0.4297838640461095, + "kl": 0.08544921875, + "learning_rate": 1.4313814328256159e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3694 + }, + { + "completion_length": 1635.8333435058594, + "epoch": 0.5632621951219512, + "grad_norm": 2.0092082207784663, + "kl": 0.11279296875, + "learning_rate": 1.43058410930191e-06, + "loss": 0.0045, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3695 + }, + { + "completion_length": 1057.8333740234375, + "epoch": 0.5634146341463414, + "grad_norm": 2.197356564590612, + "kl": 0.1162109375, + "learning_rate": 1.429786805432882e-06, + "loss": 0.0046, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 3696 + }, + { + "completion_length": 1651.6666870117188, + "epoch": 0.5635670731707317, + "grad_norm": 0.1057419693901072, + "kl": 0.074462890625, + "learning_rate": 1.4289895214442842e-06, + "loss": 0.003, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3697 + }, + { + "completion_length": 2240.666748046875, + "epoch": 0.563719512195122, + "grad_norm": 0.08642876417644185, + "kl": 0.06298828125, + "learning_rate": 1.4281922575618623e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3698 + }, + { + "completion_length": 1377.3333740234375, + "epoch": 0.5638719512195122, + "grad_norm": 0.12919220018694616, + "kl": 0.08642578125, + "learning_rate": 1.4273950140113564e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3699 + }, + { + "completion_length": 1792.666748046875, + "epoch": 0.5640243902439024, + "grad_norm": 0.20551824257178355, + "kl": 0.10546875, + "learning_rate": 1.4265977910185013e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3700 + }, + { + "completion_length": 1368.8333435058594, + "epoch": 0.5641768292682927, + "grad_norm": 0.19782735072735613, + "kl": 0.083740234375, + "learning_rate": 1.4258005888090257e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3701 + }, + { + "completion_length": 1474.3333740234375, + "epoch": 0.5643292682926829, + "grad_norm": 0.2171518278221714, + "kl": 0.129638671875, + "learning_rate": 1.4250034076086523e-06, + "loss": 0.0052, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3702 + }, + { + "completion_length": 1778.0000915527344, + "epoch": 0.5644817073170731, + "grad_norm": 0.41886164061290365, + "kl": 0.07275390625, + "learning_rate": 1.424206247643098e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3703 + }, + { + "completion_length": 1803.166748046875, + "epoch": 0.5646341463414634, + "grad_norm": 0.09235182679224875, + "kl": 0.0791015625, + "learning_rate": 1.4234091091380743e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3704 + }, + { + "completion_length": 2453.6666870117188, + "epoch": 0.5647865853658537, + "grad_norm": 0.11482454131015996, + "kl": 0.083984375, + "learning_rate": 1.422611992319285e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3705 + }, + { + "completion_length": 1020.0000610351562, + "epoch": 0.5649390243902439, + "grad_norm": 0.8735936314342352, + "kl": 0.12353515625, + "learning_rate": 1.42181489741243e-06, + "loss": 0.0049, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3706 + }, + { + "completion_length": 1152.8333740234375, + "epoch": 0.5650914634146341, + "grad_norm": 1.353616695756608, + "kl": 0.0777587890625, + "learning_rate": 1.4210178246432007e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3707 + }, + { + "completion_length": 1827.3333740234375, + "epoch": 0.5652439024390243, + "grad_norm": 0.0791068549044405, + "kl": 0.072509765625, + "learning_rate": 1.420220774237284e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3708 + }, + { + "completion_length": 1576.0000610351562, + "epoch": 0.5653963414634147, + "grad_norm": 0.08359633648763419, + "kl": 0.0791015625, + "learning_rate": 1.419423746420359e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3709 + }, + { + "completion_length": 996.8333740234375, + "epoch": 0.5655487804878049, + "grad_norm": 0.13555772353201787, + "kl": 0.094970703125, + "learning_rate": 1.4186267414180992e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3710 + }, + { + "completion_length": 715.3333435058594, + "epoch": 0.5657012195121951, + "grad_norm": 0.11709535538808426, + "kl": 0.085693359375, + "learning_rate": 1.4178297594561726e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3711 + }, + { + "completion_length": 781.5000305175781, + "epoch": 0.5658536585365853, + "grad_norm": 0.19032390924172035, + "kl": 0.097900390625, + "learning_rate": 1.4170328007602395e-06, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3712 + }, + { + "completion_length": 1567.3333740234375, + "epoch": 0.5660060975609756, + "grad_norm": 0.21206220656173397, + "kl": 0.109375, + "learning_rate": 1.416235865555953e-06, + "loss": 0.0044, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3713 + }, + { + "completion_length": 1114.1667175292969, + "epoch": 0.5661585365853659, + "grad_norm": 0.10338907797070149, + "kl": 0.080810546875, + "learning_rate": 1.415438954068961e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3714 + }, + { + "completion_length": 1935.0, + "epoch": 0.5663109756097561, + "grad_norm": 0.10252665847486123, + "kl": 0.084228515625, + "learning_rate": 1.4146420665249037e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3715 + }, + { + "completion_length": 1838.3333740234375, + "epoch": 0.5664634146341463, + "grad_norm": 0.06831552031025567, + "kl": 0.0621337890625, + "learning_rate": 1.4138452031494152e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3716 + }, + { + "completion_length": 1855.666748046875, + "epoch": 0.5666158536585366, + "grad_norm": 1.7040646041333214, + "kl": 0.0947265625, + "learning_rate": 1.4130483641681222e-06, + "loss": 0.0038, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3717 + }, + { + "completion_length": 670.3333435058594, + "epoch": 0.5667682926829268, + "grad_norm": 0.16268837905135153, + "kl": 0.117919921875, + "learning_rate": 1.4122515498066446e-06, + "loss": 0.0047, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3718 + }, + { + "completion_length": 1960.3333740234375, + "epoch": 0.566920731707317, + "grad_norm": 0.07248758748538865, + "kl": 0.0677490234375, + "learning_rate": 1.4114547602905956e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3719 + }, + { + "completion_length": 1426.3333435058594, + "epoch": 0.5670731707317073, + "grad_norm": 0.14955517864050794, + "kl": 0.077392578125, + "learning_rate": 1.4106579958455812e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3720 + }, + { + "completion_length": 2099.0, + "epoch": 0.5672256097560976, + "grad_norm": 0.1215715133043059, + "kl": 0.099365234375, + "learning_rate": 1.4098612566972e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3721 + }, + { + "completion_length": 1662.5001220703125, + "epoch": 0.5673780487804878, + "grad_norm": 0.12454519592948265, + "kl": 0.0849609375, + "learning_rate": 1.4090645430710445e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3722 + }, + { + "completion_length": 1616.1667175292969, + "epoch": 0.567530487804878, + "grad_norm": 0.12159131790643785, + "kl": 0.08642578125, + "learning_rate": 1.4082678551926982e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3723 + }, + { + "completion_length": 909.0000305175781, + "epoch": 0.5676829268292682, + "grad_norm": 0.17869768799174757, + "kl": 0.09375, + "learning_rate": 1.4074711932877393e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3724 + }, + { + "completion_length": 1101.6666870117188, + "epoch": 0.5678353658536586, + "grad_norm": 0.1154547848768028, + "kl": 0.08056640625, + "learning_rate": 1.4066745575817369e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3725 + }, + { + "completion_length": 962.3333740234375, + "epoch": 0.5679878048780488, + "grad_norm": 0.14033039343379355, + "kl": 0.103759765625, + "learning_rate": 1.405877948300254e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3726 + }, + { + "completion_length": 1133.3333740234375, + "epoch": 0.568140243902439, + "grad_norm": 0.13485206495451943, + "kl": 0.100341796875, + "learning_rate": 1.4050813656688451e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3727 + }, + { + "completion_length": 1250.5000610351562, + "epoch": 0.5682926829268292, + "grad_norm": 0.12953582721861548, + "kl": 0.089599609375, + "learning_rate": 1.4042848099130574e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3728 + }, + { + "completion_length": 1186.0000610351562, + "epoch": 0.5684451219512195, + "grad_norm": 0.08250266744073921, + "kl": 0.0623779296875, + "learning_rate": 1.4034882812584316e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3729 + }, + { + "completion_length": 1347.8333740234375, + "epoch": 0.5685975609756098, + "grad_norm": 0.09686114142139882, + "kl": 0.09228515625, + "learning_rate": 1.4026917799304994e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3730 + }, + { + "completion_length": 1626.3333740234375, + "epoch": 0.56875, + "grad_norm": 1.350149667843165, + "kl": 0.0804443359375, + "learning_rate": 1.4018953061547853e-06, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3731 + }, + { + "completion_length": 925.0, + "epoch": 0.5689024390243902, + "grad_norm": 0.12169565797471298, + "kl": 0.093017578125, + "learning_rate": 1.401098860156806e-06, + "loss": 0.0037, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3732 + }, + { + "completion_length": 1528.6666870117188, + "epoch": 0.5690548780487805, + "grad_norm": 0.2994520983538191, + "kl": 0.10693359375, + "learning_rate": 1.4003024421620697e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3733 + }, + { + "completion_length": 1264.0, + "epoch": 0.5692073170731707, + "grad_norm": 0.24784614285733936, + "kl": 0.1484375, + "learning_rate": 1.3995060523960775e-06, + "loss": 0.0059, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3734 + }, + { + "completion_length": 962.0, + "epoch": 0.569359756097561, + "grad_norm": 0.08367783865223054, + "kl": 0.070068359375, + "learning_rate": 1.3987096910843227e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3735 + }, + { + "completion_length": 1342.1666870117188, + "epoch": 0.5695121951219512, + "grad_norm": 0.12947120297766548, + "kl": 0.11328125, + "learning_rate": 1.3979133584522893e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3736 + }, + { + "completion_length": 1184.0000305175781, + "epoch": 0.5696646341463415, + "grad_norm": 0.11652066770292618, + "kl": 0.0721435546875, + "learning_rate": 1.3971170547254547e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3737 + }, + { + "completion_length": 1152.1666870117188, + "epoch": 0.5698170731707317, + "grad_norm": 0.13097490598940403, + "kl": 0.08349609375, + "learning_rate": 1.3963207801292865e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3738 + }, + { + "completion_length": 884.6666870117188, + "epoch": 0.5699695121951219, + "grad_norm": 0.1412804039778707, + "kl": 0.0869140625, + "learning_rate": 1.3955245348892456e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3739 + }, + { + "completion_length": 1435.8333435058594, + "epoch": 0.5701219512195121, + "grad_norm": 1.8986684456672043, + "kl": 0.10888671875, + "learning_rate": 1.3947283192307831e-06, + "loss": 0.0044, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3740 + }, + { + "completion_length": 963.8333740234375, + "epoch": 0.5702743902439025, + "grad_norm": 0.07826500446375102, + "kl": 0.056640625, + "learning_rate": 1.3939321333793432e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3741 + }, + { + "completion_length": 732.8333435058594, + "epoch": 0.5704268292682927, + "grad_norm": 2.179934559111433, + "kl": 0.114990234375, + "learning_rate": 1.3931359775603612e-06, + "loss": 0.0046, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3742 + }, + { + "completion_length": 724.6666870117188, + "epoch": 0.5705792682926829, + "grad_norm": 0.15311011065019559, + "kl": 0.0911865234375, + "learning_rate": 1.392339851999263e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3743 + }, + { + "completion_length": 1203.8333740234375, + "epoch": 0.5707317073170731, + "grad_norm": 0.2567687894472231, + "kl": 0.103759765625, + "learning_rate": 1.391543756921467e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3744 + }, + { + "completion_length": 764.0000305175781, + "epoch": 0.5708841463414634, + "grad_norm": 0.1171727315708632, + "kl": 0.079345703125, + "learning_rate": 1.3907476925523825e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3745 + }, + { + "completion_length": 1366.3333435058594, + "epoch": 0.5710365853658537, + "grad_norm": 2.3783075643194036, + "kl": 0.085205078125, + "learning_rate": 1.3899516591174101e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3746 + }, + { + "completion_length": 2552.8333740234375, + "epoch": 0.5711890243902439, + "grad_norm": 0.0915784992359268, + "kl": 0.0697021484375, + "learning_rate": 1.389155656841941e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3747 + }, + { + "completion_length": 703.6666870117188, + "epoch": 0.5713414634146341, + "grad_norm": 0.1901858708160516, + "kl": 0.08447265625, + "learning_rate": 1.38835968595136e-06, + "loss": 0.0034, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3748 + }, + { + "completion_length": 961.1666870117188, + "epoch": 0.5714939024390244, + "grad_norm": 0.2005959775212189, + "kl": 0.106689453125, + "learning_rate": 1.38756374667104e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3749 + }, + { + "completion_length": 799.8333740234375, + "epoch": 0.5716463414634146, + "grad_norm": 0.1439217255388517, + "kl": 0.11279296875, + "learning_rate": 1.3867678392263472e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3750 + }, + { + "completion_length": 842.0, + "epoch": 0.5717987804878049, + "grad_norm": 0.16344180513310155, + "kl": 0.100341796875, + "learning_rate": 1.3859719638426368e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3751 + }, + { + "completion_length": 1839.6666870117188, + "epoch": 0.5719512195121951, + "grad_norm": 0.13190438370148402, + "kl": 0.08935546875, + "learning_rate": 1.3851761207452565e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3752 + }, + { + "completion_length": 1108.8333740234375, + "epoch": 0.5721036585365854, + "grad_norm": 0.07866980080105462, + "kl": 0.0565185546875, + "learning_rate": 1.3843803101595446e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3753 + }, + { + "completion_length": 1242.0000610351562, + "epoch": 0.5722560975609756, + "grad_norm": 0.29406956317631905, + "kl": 0.1142578125, + "learning_rate": 1.3835845323108293e-06, + "loss": 0.0046, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3754 + }, + { + "completion_length": 1462.8333740234375, + "epoch": 0.5724085365853658, + "grad_norm": 0.09036248741416328, + "kl": 0.0750732421875, + "learning_rate": 1.3827887874244306e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3755 + }, + { + "completion_length": 1195.1666870117188, + "epoch": 0.572560975609756, + "grad_norm": 1.4471014878459876, + "kl": 0.10400390625, + "learning_rate": 1.3819930757256585e-06, + "loss": 0.0042, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3756 + }, + { + "completion_length": 801.3333740234375, + "epoch": 0.5727134146341464, + "grad_norm": 0.19486682168656955, + "kl": 0.07958984375, + "learning_rate": 1.3811973974398142e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3757 + }, + { + "completion_length": 1226.5000610351562, + "epoch": 0.5728658536585366, + "grad_norm": 2.137617532436886, + "kl": 0.131591796875, + "learning_rate": 1.3804017527921884e-06, + "loss": 0.0053, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3758 + }, + { + "completion_length": 1186.6666870117188, + "epoch": 0.5730182926829268, + "grad_norm": 0.09734787555265362, + "kl": 0.073486328125, + "learning_rate": 1.3796061420080635e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3759 + }, + { + "completion_length": 919.6666870117188, + "epoch": 0.573170731707317, + "grad_norm": 0.1044188634420962, + "kl": 0.075439453125, + "learning_rate": 1.3788105653127118e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3760 + }, + { + "completion_length": 986.0, + "epoch": 0.5733231707317074, + "grad_norm": 0.10496524509651008, + "kl": 0.067626953125, + "learning_rate": 1.3780150229313951e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3761 + }, + { + "completion_length": 610.8333435058594, + "epoch": 0.5734756097560976, + "grad_norm": 0.1767694081820491, + "kl": 0.093017578125, + "learning_rate": 1.3772195150893676e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3762 + }, + { + "completion_length": 1408.1666870117188, + "epoch": 0.5736280487804878, + "grad_norm": 0.14915733314027868, + "kl": 0.08154296875, + "learning_rate": 1.376424042011871e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3763 + }, + { + "completion_length": 826.3333740234375, + "epoch": 0.573780487804878, + "grad_norm": 0.3587822229477372, + "kl": 0.0927734375, + "learning_rate": 1.3756286039241397e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3764 + }, + { + "completion_length": 1437.5000610351562, + "epoch": 0.5739329268292683, + "grad_norm": 0.07365233381003486, + "kl": 0.062255859375, + "learning_rate": 1.3748332010513956e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3765 + }, + { + "completion_length": 832.0000305175781, + "epoch": 0.5740853658536585, + "grad_norm": 0.13517309203988073, + "kl": 0.09521484375, + "learning_rate": 1.3740378336188541e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3766 + }, + { + "completion_length": 1011.8333740234375, + "epoch": 0.5742378048780488, + "grad_norm": 0.11092742279036295, + "kl": 0.0732421875, + "learning_rate": 1.373242501851717e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3767 + }, + { + "completion_length": 1364.1666870117188, + "epoch": 0.574390243902439, + "grad_norm": 0.11433831950186718, + "kl": 0.087646484375, + "learning_rate": 1.3724472059751785e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3768 + }, + { + "completion_length": 1089.666748046875, + "epoch": 0.5745426829268293, + "grad_norm": 0.09225373281909086, + "kl": 0.08642578125, + "learning_rate": 1.3716519462144208e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3769 + }, + { + "completion_length": 788.1666870117188, + "epoch": 0.5746951219512195, + "grad_norm": 0.16646750383378267, + "kl": 0.1298828125, + "learning_rate": 1.3708567227946177e-06, + "loss": 0.0052, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3770 + }, + { + "completion_length": 1290.5000610351562, + "epoch": 0.5748475609756097, + "grad_norm": 0.1103277936850358, + "kl": 0.09130859375, + "learning_rate": 1.3700615359409307e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3771 + }, + { + "completion_length": 1414.5, + "epoch": 0.575, + "grad_norm": 0.1457424380972146, + "kl": 0.1142578125, + "learning_rate": 1.3692663858785126e-06, + "loss": 0.0046, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3772 + }, + { + "completion_length": 1234.1666870117188, + "epoch": 0.5751524390243903, + "grad_norm": 0.07795613727702781, + "kl": 0.057861328125, + "learning_rate": 1.3684712728325058e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3773 + }, + { + "completion_length": 753.1666870117188, + "epoch": 0.5753048780487805, + "grad_norm": 0.13687951764148146, + "kl": 0.124267578125, + "learning_rate": 1.3676761970280404e-06, + "loss": 0.005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3774 + }, + { + "completion_length": 2051.8334350585938, + "epoch": 0.5754573170731707, + "grad_norm": 0.07929196205985004, + "kl": 0.0733642578125, + "learning_rate": 1.366881158690238e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3775 + }, + { + "completion_length": 1089.666748046875, + "epoch": 0.5756097560975609, + "grad_norm": 0.1534643895277648, + "kl": 0.092529296875, + "learning_rate": 1.3660861580442087e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3776 + }, + { + "completion_length": 858.8333435058594, + "epoch": 0.5757621951219513, + "grad_norm": 0.09403801467313662, + "kl": 0.0689697265625, + "learning_rate": 1.365291195315052e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3777 + }, + { + "completion_length": 1100.5000610351562, + "epoch": 0.5759146341463415, + "grad_norm": 0.09319750410930401, + "kl": 0.0654296875, + "learning_rate": 1.3644962707278559e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3778 + }, + { + "completion_length": 955.5, + "epoch": 0.5760670731707317, + "grad_norm": 0.12550414335480276, + "kl": 0.0831298828125, + "learning_rate": 1.3637013845076992e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3779 + }, + { + "completion_length": 1793.0, + "epoch": 0.5762195121951219, + "grad_norm": 0.0885401586945874, + "kl": 0.072998046875, + "learning_rate": 1.3629065368796491e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3780 + }, + { + "completion_length": 1389.8333740234375, + "epoch": 0.5763719512195122, + "grad_norm": 0.15500845087058537, + "kl": 0.074462890625, + "learning_rate": 1.362111728068761e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3781 + }, + { + "completion_length": 1857.3333740234375, + "epoch": 0.5765243902439025, + "grad_norm": 2.1951741491292482, + "kl": 0.083251953125, + "learning_rate": 1.3613169583000806e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3782 + }, + { + "completion_length": 1005.6667175292969, + "epoch": 0.5766768292682927, + "grad_norm": 0.11361913883471252, + "kl": 0.0704345703125, + "learning_rate": 1.360522227798642e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3783 + }, + { + "completion_length": 974.5, + "epoch": 0.5768292682926829, + "grad_norm": 0.10191128601748939, + "kl": 0.080810546875, + "learning_rate": 1.3597275367894676e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3784 + }, + { + "completion_length": 908.8333435058594, + "epoch": 0.5769817073170732, + "grad_norm": 0.11600990899721392, + "kl": 0.076904296875, + "learning_rate": 1.35893288549757e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3785 + }, + { + "completion_length": 1220.8333740234375, + "epoch": 0.5771341463414634, + "grad_norm": 0.12696922738480562, + "kl": 0.093017578125, + "learning_rate": 1.35813827414795e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3786 + }, + { + "completion_length": 1795.1666870117188, + "epoch": 0.5772865853658536, + "grad_norm": 0.36040841241803334, + "kl": 0.091552734375, + "learning_rate": 1.357343702965596e-06, + "loss": 0.0037, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3787 + }, + { + "completion_length": 1201.8333435058594, + "epoch": 0.5774390243902439, + "grad_norm": 0.08028127685460365, + "kl": 0.0572509765625, + "learning_rate": 1.3565491721754867e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3788 + }, + { + "completion_length": 1357.5000305175781, + "epoch": 0.5775914634146342, + "grad_norm": 0.5411638054328771, + "kl": 0.07470703125, + "learning_rate": 1.3557546820025878e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3789 + }, + { + "completion_length": 1391.6667175292969, + "epoch": 0.5777439024390244, + "grad_norm": 0.09946610323890707, + "kl": 0.07470703125, + "learning_rate": 1.3549602326718546e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3790 + }, + { + "completion_length": 1737.666748046875, + "epoch": 0.5778963414634146, + "grad_norm": 0.07508318217697789, + "kl": 0.0489501953125, + "learning_rate": 1.3541658244082307e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3791 + }, + { + "completion_length": 2123.0001220703125, + "epoch": 0.5780487804878048, + "grad_norm": 0.11575968101592377, + "kl": 0.075927734375, + "learning_rate": 1.3533714574366473e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3792 + }, + { + "completion_length": 1825.0, + "epoch": 0.5782012195121952, + "grad_norm": 0.09159666182958058, + "kl": 0.0643310546875, + "learning_rate": 1.3525771319820254e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3793 + }, + { + "completion_length": 1514.8333740234375, + "epoch": 0.5783536585365854, + "grad_norm": 0.07492108736583779, + "kl": 0.077392578125, + "learning_rate": 1.3517828482692724e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3794 + }, + { + "completion_length": 1255.1666870117188, + "epoch": 0.5785060975609756, + "grad_norm": 0.095641213027701, + "kl": 0.101318359375, + "learning_rate": 1.3509886065232851e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3795 + }, + { + "completion_length": 1621.0, + "epoch": 0.5786585365853658, + "grad_norm": 0.09490356437827854, + "kl": 0.0755615234375, + "learning_rate": 1.3501944069689483e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3796 + }, + { + "completion_length": 1158.8333740234375, + "epoch": 0.5788109756097561, + "grad_norm": 0.0955086617917695, + "kl": 0.074951171875, + "learning_rate": 1.3494002498311342e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3797 + }, + { + "completion_length": 850.8333435058594, + "epoch": 0.5789634146341464, + "grad_norm": 0.09886344111067558, + "kl": 0.0526123046875, + "learning_rate": 1.3486061353347042e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3798 + }, + { + "completion_length": 1314.8333740234375, + "epoch": 0.5791158536585366, + "grad_norm": 0.09798349271270783, + "kl": 0.083740234375, + "learning_rate": 1.3478120637045064e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3799 + }, + { + "completion_length": 1529.5, + "epoch": 0.5792682926829268, + "grad_norm": 0.08305174838824259, + "kl": 0.057861328125, + "learning_rate": 1.3470180351653773e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3800 + }, + { + "completion_length": 1724.8334350585938, + "epoch": 0.5794207317073171, + "grad_norm": 2.3784769167002775, + "kl": 0.1083984375, + "learning_rate": 1.346224049942141e-06, + "loss": 0.0044, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3801 + }, + { + "completion_length": 1065.1666870117188, + "epoch": 0.5795731707317073, + "grad_norm": 0.1915873475974972, + "kl": 0.09326171875, + "learning_rate": 1.3454301082596095e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3802 + }, + { + "completion_length": 1049.1667175292969, + "epoch": 0.5797256097560975, + "grad_norm": 0.11545058629762903, + "kl": 0.076171875, + "learning_rate": 1.344636210342583e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3803 + }, + { + "completion_length": 799.8333740234375, + "epoch": 0.5798780487804878, + "grad_norm": 0.14117474498950497, + "kl": 0.0745849609375, + "learning_rate": 1.3438423564158484e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3804 + }, + { + "completion_length": 1371.8333740234375, + "epoch": 0.5800304878048781, + "grad_norm": 0.15419273602220956, + "kl": 0.062744140625, + "learning_rate": 1.3430485467041802e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3805 + }, + { + "completion_length": 953.6667175292969, + "epoch": 0.5801829268292683, + "grad_norm": 2.140248722811776, + "kl": 0.07421875, + "learning_rate": 1.3422547814323413e-06, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3806 + }, + { + "completion_length": 2504.0, + "epoch": 0.5803353658536585, + "grad_norm": 0.09403053490728575, + "kl": 0.0616455078125, + "learning_rate": 1.3414610608250808e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3807 + }, + { + "completion_length": 1170.0000610351562, + "epoch": 0.5804878048780487, + "grad_norm": 0.1111241675377519, + "kl": 0.073486328125, + "learning_rate": 1.3406673851071362e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3808 + }, + { + "completion_length": 1027.5000305175781, + "epoch": 0.5806402439024391, + "grad_norm": 0.11322932812997714, + "kl": 0.080322265625, + "learning_rate": 1.3398737545032318e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3809 + }, + { + "completion_length": 1099.3333435058594, + "epoch": 0.5807926829268293, + "grad_norm": 0.1732890897588042, + "kl": 0.0673828125, + "learning_rate": 1.339080169238079e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3810 + }, + { + "completion_length": 955.1666870117188, + "epoch": 0.5809451219512195, + "grad_norm": 0.1267440583310601, + "kl": 0.06298828125, + "learning_rate": 1.3382866295363767e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3811 + }, + { + "completion_length": 2055.0001220703125, + "epoch": 0.5810975609756097, + "grad_norm": 0.08464963429561718, + "kl": 0.072998046875, + "learning_rate": 1.3374931356228103e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3812 + }, + { + "completion_length": 2363.0001220703125, + "epoch": 0.58125, + "grad_norm": 0.0518876248549513, + "kl": 0.0576171875, + "learning_rate": 1.3366996877220535e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3813 + }, + { + "completion_length": 771.6666870117188, + "epoch": 0.5814024390243903, + "grad_norm": 0.20494361486732887, + "kl": 0.11279296875, + "learning_rate": 1.3359062860587652e-06, + "loss": 0.0045, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3814 + }, + { + "completion_length": 745.3333435058594, + "epoch": 0.5815548780487805, + "grad_norm": 0.12084590465721416, + "kl": 0.1083984375, + "learning_rate": 1.3351129308575925e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3815 + }, + { + "completion_length": 1345.3333740234375, + "epoch": 0.5817073170731707, + "grad_norm": 0.10714355787145097, + "kl": 0.088623046875, + "learning_rate": 1.3343196223431698e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3816 + }, + { + "completion_length": 1602.0000915527344, + "epoch": 0.581859756097561, + "grad_norm": 0.0779035145712789, + "kl": 0.061279296875, + "learning_rate": 1.3335263607401161e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3817 + }, + { + "completion_length": 2244.3334350585938, + "epoch": 0.5820121951219512, + "grad_norm": 0.7273533045626083, + "kl": 0.104248046875, + "learning_rate": 1.3327331462730397e-06, + "loss": 0.0042, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3818 + }, + { + "completion_length": 1232.0, + "epoch": 0.5821646341463415, + "grad_norm": 0.10781243802700505, + "kl": 0.0638427734375, + "learning_rate": 1.3319399791665336e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3819 + }, + { + "completion_length": 2287.8333740234375, + "epoch": 0.5823170731707317, + "grad_norm": 0.07724838522410081, + "kl": 0.067138671875, + "learning_rate": 1.3311468596451785e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3820 + }, + { + "completion_length": 1072.3333740234375, + "epoch": 0.582469512195122, + "grad_norm": 0.12225451633144187, + "kl": 0.07958984375, + "learning_rate": 1.33035378793354e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3821 + }, + { + "completion_length": 1005.0000610351562, + "epoch": 0.5826219512195122, + "grad_norm": 0.10143191271901707, + "kl": 0.087890625, + "learning_rate": 1.3295607642561738e-06, + "loss": 0.0035, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3822 + }, + { + "completion_length": 1149.5000610351562, + "epoch": 0.5827743902439024, + "grad_norm": 0.10044615582423316, + "kl": 0.0849609375, + "learning_rate": 1.328767788837618e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3823 + }, + { + "completion_length": 2044.8334350585938, + "epoch": 0.5829268292682926, + "grad_norm": 0.057148588475730906, + "kl": 0.0625, + "learning_rate": 1.3279748619023995e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3824 + }, + { + "completion_length": 869.0000305175781, + "epoch": 0.583079268292683, + "grad_norm": 0.1458062046535164, + "kl": 0.0697021484375, + "learning_rate": 1.3271819836750297e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3825 + }, + { + "completion_length": 1460.3333740234375, + "epoch": 0.5832317073170732, + "grad_norm": 0.17665830363017032, + "kl": 0.063232421875, + "learning_rate": 1.326389154380008e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3826 + }, + { + "completion_length": 1744.0, + "epoch": 0.5833841463414634, + "grad_norm": 0.1486695653431822, + "kl": 0.0714111328125, + "learning_rate": 1.3255963742418184e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3827 + }, + { + "completion_length": 1172.1666870117188, + "epoch": 0.5835365853658536, + "grad_norm": 0.11600796799576192, + "kl": 0.087646484375, + "learning_rate": 1.3248036434849319e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3828 + }, + { + "completion_length": 1435.0000610351562, + "epoch": 0.583689024390244, + "grad_norm": 0.07144811429946855, + "kl": 0.06640625, + "learning_rate": 1.3240109623338058e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3829 + }, + { + "completion_length": 854.5000305175781, + "epoch": 0.5838414634146342, + "grad_norm": 0.1822648047490899, + "kl": 0.091064453125, + "learning_rate": 1.323218331012882e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3830 + }, + { + "completion_length": 1632.5001220703125, + "epoch": 0.5839939024390244, + "grad_norm": 0.17085006365716318, + "kl": 0.0567626953125, + "learning_rate": 1.32242574974659e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3831 + }, + { + "completion_length": 988.1666870117188, + "epoch": 0.5841463414634146, + "grad_norm": 1.7697339911978616, + "kl": 0.072509765625, + "learning_rate": 1.3216332187593434e-06, + "loss": 0.0029, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3832 + }, + { + "completion_length": 1747.6667175292969, + "epoch": 0.5842987804878049, + "grad_norm": 0.08144953768343231, + "kl": 0.061767578125, + "learning_rate": 1.3208407382755435e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3833 + }, + { + "completion_length": 1024.6667175292969, + "epoch": 0.5844512195121951, + "grad_norm": 0.1282022855639149, + "kl": 0.079833984375, + "learning_rate": 1.3200483085195748e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3834 + }, + { + "completion_length": 2125.166748046875, + "epoch": 0.5846036585365854, + "grad_norm": 0.0894215421656731, + "kl": 0.0673828125, + "learning_rate": 1.3192559297158099e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3835 + }, + { + "completion_length": 3409.33349609375, + "epoch": 0.5847560975609756, + "grad_norm": 0.043655858604759123, + "kl": 0.045166015625, + "learning_rate": 1.3184636020886058e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3836 + }, + { + "completion_length": 1951.3333740234375, + "epoch": 0.5849085365853659, + "grad_norm": 0.07016468928703687, + "kl": 0.0438232421875, + "learning_rate": 1.317671325862305e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3837 + }, + { + "completion_length": 1659.666748046875, + "epoch": 0.5850609756097561, + "grad_norm": 0.0810555911746924, + "kl": 0.072021484375, + "learning_rate": 1.3168791012612357e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3838 + }, + { + "completion_length": 1927.8333740234375, + "epoch": 0.5852134146341463, + "grad_norm": 0.10994566308486828, + "kl": 0.080078125, + "learning_rate": 1.3160869285097103e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3839 + }, + { + "completion_length": 1655.8333740234375, + "epoch": 0.5853658536585366, + "grad_norm": 0.07336521541611002, + "kl": 0.069091796875, + "learning_rate": 1.3152948078320297e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3840 + }, + { + "completion_length": 1645.3333740234375, + "epoch": 0.5855182926829269, + "grad_norm": 0.07932858117354914, + "kl": 0.054443359375, + "learning_rate": 1.3145027394524763e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3841 + }, + { + "completion_length": 681.6666870117188, + "epoch": 0.5856707317073171, + "grad_norm": 0.132438416127292, + "kl": 0.064453125, + "learning_rate": 1.3137107235953202e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3842 + }, + { + "completion_length": 598.5000305175781, + "epoch": 0.5858231707317073, + "grad_norm": 0.1363588335847283, + "kl": 0.087890625, + "learning_rate": 1.3129187604848152e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3843 + }, + { + "completion_length": 2414.3333740234375, + "epoch": 0.5859756097560975, + "grad_norm": 0.08984449555571843, + "kl": 0.0645751953125, + "learning_rate": 1.3121268503452014e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3844 + }, + { + "completion_length": 1063.6667175292969, + "epoch": 0.5861280487804879, + "grad_norm": 0.07488987925018019, + "kl": 0.055908203125, + "learning_rate": 1.3113349934007023e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3845 + }, + { + "completion_length": 480.6666717529297, + "epoch": 0.5862804878048781, + "grad_norm": 2.137650115814218, + "kl": 0.12451171875, + "learning_rate": 1.3105431898755278e-06, + "loss": 0.005, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3846 + }, + { + "completion_length": 1628.5, + "epoch": 0.5864329268292683, + "grad_norm": 0.10056596810338526, + "kl": 0.0653076171875, + "learning_rate": 1.3097514399938727e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3847 + }, + { + "completion_length": 865.6666870117188, + "epoch": 0.5865853658536585, + "grad_norm": 1.4633402326378189, + "kl": 0.0767822265625, + "learning_rate": 1.3089597439799151e-06, + "loss": 0.0031, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 3848 + }, + { + "completion_length": 2461.166748046875, + "epoch": 0.5867378048780488, + "grad_norm": 0.07005016908800335, + "kl": 0.063232421875, + "learning_rate": 1.3081681020578193e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3849 + }, + { + "completion_length": 2454.8333435058594, + "epoch": 0.586890243902439, + "grad_norm": 0.6939158307839591, + "kl": 0.0948486328125, + "learning_rate": 1.3073765144517334e-06, + "loss": 0.0038, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3850 + }, + { + "completion_length": 1344.5, + "epoch": 0.5870426829268293, + "grad_norm": 1.2227711483516917, + "kl": 0.091796875, + "learning_rate": 1.3065849813857913e-06, + "loss": 0.0037, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3851 + }, + { + "completion_length": 963.8333740234375, + "epoch": 0.5871951219512195, + "grad_norm": 0.16117971876068093, + "kl": 0.078857421875, + "learning_rate": 1.3057935030841096e-06, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3852 + }, + { + "completion_length": 2784.5001220703125, + "epoch": 0.5873475609756098, + "grad_norm": 0.930112606620772, + "kl": 0.0538330078125, + "learning_rate": 1.3050020797707913e-06, + "loss": 0.0022, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3853 + }, + { + "completion_length": 3463.666748046875, + "epoch": 0.5875, + "grad_norm": 0.07033137492143861, + "kl": 0.0509033203125, + "learning_rate": 1.304210711669923e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3854 + }, + { + "completion_length": 2050.0001220703125, + "epoch": 0.5876524390243902, + "grad_norm": 0.18803541158429155, + "kl": 0.086669921875, + "learning_rate": 1.303419399005575e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3855 + }, + { + "completion_length": 2323.3333740234375, + "epoch": 0.5878048780487805, + "grad_norm": 0.9186827327610215, + "kl": 0.066162109375, + "learning_rate": 1.3026281420018034e-06, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3856 + }, + { + "completion_length": 2492.8333740234375, + "epoch": 0.5879573170731708, + "grad_norm": 0.06061491955411288, + "kl": 0.0433349609375, + "learning_rate": 1.3018369408826468e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3857 + }, + { + "completion_length": 1834.666748046875, + "epoch": 0.588109756097561, + "grad_norm": 0.11533740577417728, + "kl": 0.05859375, + "learning_rate": 1.3010457958721292e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3858 + }, + { + "completion_length": 1731.0000610351562, + "epoch": 0.5882621951219512, + "grad_norm": 1.3811868094999655, + "kl": 0.112060546875, + "learning_rate": 1.3002547071942585e-06, + "loss": 0.0045, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3859 + }, + { + "completion_length": 2730.6666870117188, + "epoch": 0.5884146341463414, + "grad_norm": 0.1761299201990519, + "kl": 0.0777587890625, + "learning_rate": 1.2994636750730272e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3860 + }, + { + "completion_length": 2047.6666870117188, + "epoch": 0.5885670731707318, + "grad_norm": 0.15259148157523925, + "kl": 0.085693359375, + "learning_rate": 1.2986726997324102e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3861 + }, + { + "completion_length": 1672.6667175292969, + "epoch": 0.588719512195122, + "grad_norm": 0.16177851084960837, + "kl": 0.07666015625, + "learning_rate": 1.297881781396368e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3862 + }, + { + "completion_length": 1477.1666870117188, + "epoch": 0.5888719512195122, + "grad_norm": 0.1557091828610997, + "kl": 0.0927734375, + "learning_rate": 1.297090920288843e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3863 + }, + { + "completion_length": 2863.8333740234375, + "epoch": 0.5890243902439024, + "grad_norm": 0.11117385369243972, + "kl": 0.073974609375, + "learning_rate": 1.2963001166337642e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3864 + }, + { + "completion_length": 1493.5, + "epoch": 0.5891768292682927, + "grad_norm": 0.19976950202220464, + "kl": 0.071044921875, + "learning_rate": 1.2955093706550415e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3865 + }, + { + "completion_length": 2387.5, + "epoch": 0.589329268292683, + "grad_norm": 0.11730323509353882, + "kl": 0.075439453125, + "learning_rate": 1.2947186825765701e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3866 + }, + { + "completion_length": 1150.0000610351562, + "epoch": 0.5894817073170732, + "grad_norm": 0.13446139414512503, + "kl": 0.0679931640625, + "learning_rate": 1.2939280526222288e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3867 + }, + { + "completion_length": 1067.0000610351562, + "epoch": 0.5896341463414634, + "grad_norm": 0.10324613453579605, + "kl": 0.070556640625, + "learning_rate": 1.2931374810158788e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3868 + }, + { + "completion_length": 3201.3333740234375, + "epoch": 0.5897865853658537, + "grad_norm": 0.07529550619808069, + "kl": 0.063720703125, + "learning_rate": 1.2923469679813663e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3869 + }, + { + "completion_length": 2051.5000610351562, + "epoch": 0.5899390243902439, + "grad_norm": 0.0863513703235855, + "kl": 0.069580078125, + "learning_rate": 1.2915565137425196e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3870 + }, + { + "completion_length": 3286.166748046875, + "epoch": 0.5900914634146341, + "grad_norm": 0.08910282847368978, + "kl": 0.06982421875, + "learning_rate": 1.2907661185231514e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3871 + }, + { + "completion_length": 2586.666748046875, + "epoch": 0.5902439024390244, + "grad_norm": 0.06673141929451108, + "kl": 0.0638427734375, + "learning_rate": 1.2899757825470568e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3872 + }, + { + "completion_length": 2360.0000915527344, + "epoch": 0.5903963414634147, + "grad_norm": 0.063310847136842, + "kl": 0.048828125, + "learning_rate": 1.2891855060380144e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3873 + }, + { + "completion_length": 1430.3333435058594, + "epoch": 0.5905487804878049, + "grad_norm": 0.11280294797252752, + "kl": 0.082275390625, + "learning_rate": 1.2883952892197868e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3874 + }, + { + "completion_length": 1246.8333740234375, + "epoch": 0.5907012195121951, + "grad_norm": 0.147043723830377, + "kl": 0.08251953125, + "learning_rate": 1.2876051323161181e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3875 + }, + { + "completion_length": 1778.3334350585938, + "epoch": 0.5908536585365853, + "grad_norm": 0.07905768782109543, + "kl": 0.065673828125, + "learning_rate": 1.2868150355507365e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3876 + }, + { + "completion_length": 1762.8333740234375, + "epoch": 0.5910060975609757, + "grad_norm": 0.1029194971824324, + "kl": 0.07470703125, + "learning_rate": 1.2860249991473535e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3877 + }, + { + "completion_length": 1298.3333435058594, + "epoch": 0.5911585365853659, + "grad_norm": 0.08281796304873486, + "kl": 0.0709228515625, + "learning_rate": 1.285235023329663e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3878 + }, + { + "completion_length": 1695.5000610351562, + "epoch": 0.5913109756097561, + "grad_norm": 1.3175212691732914, + "kl": 0.08984375, + "learning_rate": 1.284445108321341e-06, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3879 + }, + { + "completion_length": 1322.3333435058594, + "epoch": 0.5914634146341463, + "grad_norm": 0.07864073227828539, + "kl": 0.0615234375, + "learning_rate": 1.283655254346048e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3880 + }, + { + "completion_length": 1939.666748046875, + "epoch": 0.5916158536585366, + "grad_norm": 0.13146204880331117, + "kl": 0.07177734375, + "learning_rate": 1.2828654616274255e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3881 + }, + { + "completion_length": 583.8333587646484, + "epoch": 0.5917682926829269, + "grad_norm": 0.20829588813337957, + "kl": 0.065673828125, + "learning_rate": 1.2820757303890988e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3882 + }, + { + "completion_length": 1253.5000610351562, + "epoch": 0.5919207317073171, + "grad_norm": 0.1106155023087071, + "kl": 0.085205078125, + "learning_rate": 1.281286060854675e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3883 + }, + { + "completion_length": 1995.0, + "epoch": 0.5920731707317073, + "grad_norm": 0.09049018532123417, + "kl": 0.069091796875, + "learning_rate": 1.2804964532477444e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3884 + }, + { + "completion_length": 2116.0, + "epoch": 0.5922256097560976, + "grad_norm": 0.07682445870638643, + "kl": 0.0596923828125, + "learning_rate": 1.2797069077918796e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3885 + }, + { + "completion_length": 1035.5000305175781, + "epoch": 0.5923780487804878, + "grad_norm": 0.13664429153354662, + "kl": 0.06494140625, + "learning_rate": 1.2789174247106353e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3886 + }, + { + "completion_length": 2365.666748046875, + "epoch": 0.592530487804878, + "grad_norm": 0.06832109096175583, + "kl": 0.0546875, + "learning_rate": 1.2781280042275489e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3887 + }, + { + "completion_length": 1147.3333740234375, + "epoch": 0.5926829268292683, + "grad_norm": 0.13404561336366003, + "kl": 0.072998046875, + "learning_rate": 1.2773386465661395e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3888 + }, + { + "completion_length": 1331.3333740234375, + "epoch": 0.5928353658536586, + "grad_norm": 0.10657030896502728, + "kl": 0.0672607421875, + "learning_rate": 1.2765493519499095e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3889 + }, + { + "completion_length": 1706.3333740234375, + "epoch": 0.5929878048780488, + "grad_norm": 0.07507577000348907, + "kl": 0.0810546875, + "learning_rate": 1.2757601206023417e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3890 + }, + { + "completion_length": 1320.1666870117188, + "epoch": 0.593140243902439, + "grad_norm": 1.9668560769188213, + "kl": 0.0859375, + "learning_rate": 1.274970952746903e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3891 + }, + { + "completion_length": 1712.0, + "epoch": 0.5932926829268292, + "grad_norm": 0.18044686742237637, + "kl": 0.07373046875, + "learning_rate": 1.2741818486070414e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3892 + }, + { + "completion_length": 2000.666748046875, + "epoch": 0.5934451219512196, + "grad_norm": 0.10454755949717578, + "kl": 0.073974609375, + "learning_rate": 1.2733928084061863e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3893 + }, + { + "completion_length": 1601.166748046875, + "epoch": 0.5935975609756098, + "grad_norm": 0.2567202937485912, + "kl": 0.077392578125, + "learning_rate": 1.2726038323677491e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3894 + }, + { + "completion_length": 1773.5, + "epoch": 0.59375, + "grad_norm": 1.0158268281111045, + "kl": 0.072998046875, + "learning_rate": 1.2718149207151249e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3895 + }, + { + "completion_length": 1614.5001220703125, + "epoch": 0.5939024390243902, + "grad_norm": 0.10070553214329381, + "kl": 0.058349609375, + "learning_rate": 1.2710260736716882e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3896 + }, + { + "completion_length": 2055.3333740234375, + "epoch": 0.5940548780487804, + "grad_norm": 0.1332170417094926, + "kl": 0.0491943359375, + "learning_rate": 1.2702372914607963e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3897 + }, + { + "completion_length": 3110.5, + "epoch": 0.5942073170731708, + "grad_norm": 0.05107731328564152, + "kl": 0.0648193359375, + "learning_rate": 1.269448574305788e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3898 + }, + { + "completion_length": 1479.166748046875, + "epoch": 0.594359756097561, + "grad_norm": 1.284654186098214, + "kl": 0.0540771484375, + "learning_rate": 1.2686599224299835e-06, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3899 + }, + { + "completion_length": 2227.5001220703125, + "epoch": 0.5945121951219512, + "grad_norm": 0.06240549660032568, + "kl": 0.05859375, + "learning_rate": 1.267871336056685e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3900 + }, + { + "completion_length": 1020.1667175292969, + "epoch": 0.5946646341463414, + "grad_norm": 0.2973278599703014, + "kl": 0.07080078125, + "learning_rate": 1.2670828154091757e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3901 + }, + { + "completion_length": 1057.8333740234375, + "epoch": 0.5948170731707317, + "grad_norm": 1.589210896507433, + "kl": 0.07421875, + "learning_rate": 1.26629436071072e-06, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3902 + }, + { + "completion_length": 1568.8333740234375, + "epoch": 0.594969512195122, + "grad_norm": 0.09375027649455465, + "kl": 0.069091796875, + "learning_rate": 1.2655059721845645e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3903 + }, + { + "completion_length": 2120.8334350585938, + "epoch": 0.5951219512195122, + "grad_norm": 0.14425157521810086, + "kl": 0.0570068359375, + "learning_rate": 1.264717650053936e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3904 + }, + { + "completion_length": 733.5000305175781, + "epoch": 0.5952743902439024, + "grad_norm": 0.17045066936376915, + "kl": 0.0628662109375, + "learning_rate": 1.263929394542044e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3905 + }, + { + "completion_length": 1490.1666870117188, + "epoch": 0.5954268292682927, + "grad_norm": 0.15774986075410366, + "kl": 0.065185546875, + "learning_rate": 1.2631412058720764e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3906 + }, + { + "completion_length": 1398.8333740234375, + "epoch": 0.5955792682926829, + "grad_norm": 1.6316521096929473, + "kl": 0.0751953125, + "learning_rate": 1.2623530842672055e-06, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3907 + }, + { + "completion_length": 2832.0, + "epoch": 0.5957317073170731, + "grad_norm": 0.09842543557143756, + "kl": 0.0660400390625, + "learning_rate": 1.261565029950582e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3908 + }, + { + "completion_length": 2948.3333740234375, + "epoch": 0.5958841463414634, + "grad_norm": 0.4070653062740381, + "kl": 0.06689453125, + "learning_rate": 1.2607770431453395e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3909 + }, + { + "completion_length": 3239.8333740234375, + "epoch": 0.5960365853658537, + "grad_norm": 0.057989881840960024, + "kl": 0.0604248046875, + "learning_rate": 1.2599891240745911e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3910 + }, + { + "completion_length": 2876.0, + "epoch": 0.5961890243902439, + "grad_norm": 0.08064041243131485, + "kl": 0.056640625, + "learning_rate": 1.2592012729614308e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3911 + }, + { + "completion_length": 2297.666717529297, + "epoch": 0.5963414634146341, + "grad_norm": 0.10134028134949671, + "kl": 0.0628662109375, + "learning_rate": 1.2584134900289346e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3912 + }, + { + "completion_length": 996.6666870117188, + "epoch": 0.5964939024390243, + "grad_norm": 0.1271213153821681, + "kl": 0.064208984375, + "learning_rate": 1.2576257755001573e-06, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 3913 + }, + { + "completion_length": 3199.3333740234375, + "epoch": 0.5966463414634147, + "grad_norm": 1.272063497416153, + "kl": 0.0531005859375, + "learning_rate": 1.2568381295981365e-06, + "loss": 0.0021, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3914 + }, + { + "completion_length": 3188.666748046875, + "epoch": 0.5967987804878049, + "grad_norm": 0.13872450694414898, + "kl": 0.060302734375, + "learning_rate": 1.2560505525458883e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3915 + }, + { + "completion_length": 4019.3333740234375, + "epoch": 0.5969512195121951, + "grad_norm": 0.057591423336761365, + "kl": 0.045654296875, + "learning_rate": 1.255263044566411e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3916 + }, + { + "completion_length": 1757.0001220703125, + "epoch": 0.5971036585365853, + "grad_norm": 0.0784072568146754, + "kl": 0.062255859375, + "learning_rate": 1.2544756058826824e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3917 + }, + { + "completion_length": 2245.8333435058594, + "epoch": 0.5972560975609756, + "grad_norm": 2.2411982468219427, + "kl": 0.076171875, + "learning_rate": 1.2536882367176606e-06, + "loss": 0.003, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3918 + }, + { + "completion_length": 1891.666748046875, + "epoch": 0.5974085365853659, + "grad_norm": 0.07746797276634663, + "kl": 0.063232421875, + "learning_rate": 1.2529009372942842e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3919 + }, + { + "completion_length": 3633.0, + "epoch": 0.5975609756097561, + "grad_norm": 0.06181912917001458, + "kl": 0.0531005859375, + "learning_rate": 1.2521137078354728e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3920 + }, + { + "completion_length": 849.3333435058594, + "epoch": 0.5977134146341463, + "grad_norm": 0.11446895691488927, + "kl": 0.054443359375, + "learning_rate": 1.2513265485641247e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3921 + }, + { + "completion_length": 1219.8333435058594, + "epoch": 0.5978658536585366, + "grad_norm": 0.13021075807828855, + "kl": 0.060546875, + "learning_rate": 1.2505394597031195e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3922 + }, + { + "completion_length": 2935.5, + "epoch": 0.5980182926829268, + "grad_norm": 0.07693061847874215, + "kl": 0.052001953125, + "learning_rate": 1.249752441475317e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3923 + }, + { + "completion_length": 1225.8333435058594, + "epoch": 0.598170731707317, + "grad_norm": 0.1242218621445693, + "kl": 0.077880859375, + "learning_rate": 1.248965494103556e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3924 + }, + { + "completion_length": 2317.8333435058594, + "epoch": 0.5983231707317073, + "grad_norm": 0.13251348779453168, + "kl": 0.0791015625, + "learning_rate": 1.2481786178106556e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3925 + }, + { + "completion_length": 1172.0000305175781, + "epoch": 0.5984756097560976, + "grad_norm": 0.10747984705884628, + "kl": 0.0556640625, + "learning_rate": 1.2473918128194153e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3926 + }, + { + "completion_length": 1511.6666870117188, + "epoch": 0.5986280487804878, + "grad_norm": 0.09074787294697734, + "kl": 0.0599365234375, + "learning_rate": 1.246605079352614e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3927 + }, + { + "completion_length": 1890.1666870117188, + "epoch": 0.598780487804878, + "grad_norm": 1.4812817853938078, + "kl": 0.072998046875, + "learning_rate": 1.2458184176330102e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3928 + }, + { + "completion_length": 1342.3333740234375, + "epoch": 0.5989329268292682, + "grad_norm": 0.11307148345378938, + "kl": 0.066650390625, + "learning_rate": 1.2450318278833424e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3929 + }, + { + "completion_length": 1743.3333435058594, + "epoch": 0.5990853658536586, + "grad_norm": 0.08353474692602288, + "kl": 0.05712890625, + "learning_rate": 1.2442453103263289e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3930 + }, + { + "completion_length": 1696.5, + "epoch": 0.5992378048780488, + "grad_norm": 0.10767338484291471, + "kl": 0.07080078125, + "learning_rate": 1.243458865184666e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3931 + }, + { + "completion_length": 1800.0000610351562, + "epoch": 0.599390243902439, + "grad_norm": 0.08052856382427001, + "kl": 0.058349609375, + "learning_rate": 1.2426724926810324e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3932 + }, + { + "completion_length": 2422.0, + "epoch": 0.5995426829268292, + "grad_norm": 0.0544628763446188, + "kl": 0.03778076171875, + "learning_rate": 1.2418861930380838e-06, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3933 + }, + { + "completion_length": 1346.3333740234375, + "epoch": 0.5996951219512195, + "grad_norm": 0.10453186996464951, + "kl": 0.08740234375, + "learning_rate": 1.2410999664784567e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3934 + }, + { + "completion_length": 2010.1666870117188, + "epoch": 0.5998475609756098, + "grad_norm": 0.08738054907136715, + "kl": 0.06787109375, + "learning_rate": 1.240313813224765e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3935 + }, + { + "completion_length": 1140.3333740234375, + "epoch": 0.6, + "grad_norm": 0.09622624431613144, + "kl": 0.0587158203125, + "learning_rate": 1.2395277334996047e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3936 + }, + { + "completion_length": 3337.3333740234375, + "epoch": 0.6001524390243902, + "grad_norm": 0.056822562984919826, + "kl": 0.0576171875, + "learning_rate": 1.2387417275255477e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3937 + }, + { + "completion_length": 1875.166748046875, + "epoch": 0.6003048780487805, + "grad_norm": 0.808290132883168, + "kl": 0.056396484375, + "learning_rate": 1.2379557955251482e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3938 + }, + { + "completion_length": 968.0, + "epoch": 0.6004573170731707, + "grad_norm": 0.09539273808529156, + "kl": 0.0528564453125, + "learning_rate": 1.237169937720937e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3939 + }, + { + "completion_length": 3692.0, + "epoch": 0.600609756097561, + "grad_norm": 0.04787838809194441, + "kl": 0.0557861328125, + "learning_rate": 1.2363841543354249e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3940 + }, + { + "completion_length": 934.1666870117188, + "epoch": 0.6007621951219512, + "grad_norm": 0.12494115013323409, + "kl": 0.06689453125, + "learning_rate": 1.2355984455911022e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3941 + }, + { + "completion_length": 678.0000305175781, + "epoch": 0.6009146341463415, + "grad_norm": 0.11652512402123064, + "kl": 0.0755615234375, + "learning_rate": 1.234812811710437e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3942 + }, + { + "completion_length": 726.6666870117188, + "epoch": 0.6010670731707317, + "grad_norm": 0.09967842844744344, + "kl": 0.0657958984375, + "learning_rate": 1.234027252915877e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3943 + }, + { + "completion_length": 1472.8333740234375, + "epoch": 0.6012195121951219, + "grad_norm": 0.17258098403037345, + "kl": 0.072265625, + "learning_rate": 1.2332417694298477e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3944 + }, + { + "completion_length": 1863.1666870117188, + "epoch": 0.6013719512195121, + "grad_norm": 0.9952120964870805, + "kl": 0.0570068359375, + "learning_rate": 1.2324563614747544e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3945 + }, + { + "completion_length": 2567.0001220703125, + "epoch": 0.6015243902439025, + "grad_norm": 0.17804251107180194, + "kl": 0.080322265625, + "learning_rate": 1.2316710292729803e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3946 + }, + { + "completion_length": 3219.0001220703125, + "epoch": 0.6016768292682927, + "grad_norm": 0.07615713459094076, + "kl": 0.060791015625, + "learning_rate": 1.2308857730468872e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3947 + }, + { + "completion_length": 3642.666748046875, + "epoch": 0.6018292682926829, + "grad_norm": 0.04471416381486922, + "kl": 0.0577392578125, + "learning_rate": 1.2301005930188156e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3948 + }, + { + "completion_length": 2360.1666870117188, + "epoch": 0.6019817073170731, + "grad_norm": 0.09207195559022568, + "kl": 0.0528564453125, + "learning_rate": 1.2293154894110844e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3949 + }, + { + "completion_length": 1584.8334350585938, + "epoch": 0.6021341463414634, + "grad_norm": 0.2261953664124816, + "kl": 0.09521484375, + "learning_rate": 1.2285304624459907e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3950 + }, + { + "completion_length": 1576.8333740234375, + "epoch": 0.6022865853658537, + "grad_norm": 0.09058131765169276, + "kl": 0.09130859375, + "learning_rate": 1.2277455123458104e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3951 + }, + { + "completion_length": 3109.166748046875, + "epoch": 0.6024390243902439, + "grad_norm": 0.09832492507200642, + "kl": 0.0550537109375, + "learning_rate": 1.2269606393327968e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3952 + }, + { + "completion_length": 1514.5, + "epoch": 0.6025914634146341, + "grad_norm": 0.11548839513347652, + "kl": 0.0723876953125, + "learning_rate": 1.226175843629182e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3953 + }, + { + "completion_length": 3848.0, + "epoch": 0.6027439024390244, + "grad_norm": 0.04046827426713901, + "kl": 0.04345703125, + "learning_rate": 1.2253911254571761e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3954 + }, + { + "completion_length": 1460.5, + "epoch": 0.6028963414634146, + "grad_norm": 0.1059613983780725, + "kl": 0.0643310546875, + "learning_rate": 1.2246064850389671e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3955 + }, + { + "completion_length": 3377.5, + "epoch": 0.6030487804878049, + "grad_norm": 0.047968808982969714, + "kl": 0.055908203125, + "learning_rate": 1.223821922596721e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3956 + }, + { + "completion_length": 2038.5, + "epoch": 0.6032012195121951, + "grad_norm": 0.11097459161349223, + "kl": 0.0518798828125, + "learning_rate": 1.223037438352582e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3957 + }, + { + "completion_length": 3958.666748046875, + "epoch": 0.6033536585365854, + "grad_norm": 0.037540869760365275, + "kl": 0.0482177734375, + "learning_rate": 1.2222530325286716e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3958 + }, + { + "completion_length": 2180.3334350585938, + "epoch": 0.6035060975609756, + "grad_norm": 1.0993920143925668, + "kl": 0.071533203125, + "learning_rate": 1.2214687053470897e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3959 + }, + { + "completion_length": 1517.8333740234375, + "epoch": 0.6036585365853658, + "grad_norm": 0.07205726829637932, + "kl": 0.0521240234375, + "learning_rate": 1.2206844570299133e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3960 + }, + { + "completion_length": 2330.0, + "epoch": 0.603810975609756, + "grad_norm": 0.14244231074689567, + "kl": 0.08740234375, + "learning_rate": 1.219900287799198e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3961 + }, + { + "completion_length": 2123.3334350585938, + "epoch": 0.6039634146341464, + "grad_norm": 0.08896608193339953, + "kl": 0.0570068359375, + "learning_rate": 1.219116197876976e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3962 + }, + { + "completion_length": 2046.166748046875, + "epoch": 0.6041158536585366, + "grad_norm": 0.08159243608406774, + "kl": 0.063232421875, + "learning_rate": 1.2183321874852578e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3963 + }, + { + "completion_length": 3524.0, + "epoch": 0.6042682926829268, + "grad_norm": 0.0338198083021908, + "kl": 0.0479736328125, + "learning_rate": 1.2175482568460306e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3964 + }, + { + "completion_length": 2836.5, + "epoch": 0.604420731707317, + "grad_norm": 0.08728300696993684, + "kl": 0.069580078125, + "learning_rate": 1.21676440618126e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3965 + }, + { + "completion_length": 2167.166748046875, + "epoch": 0.6045731707317074, + "grad_norm": 0.11914938989863967, + "kl": 0.064453125, + "learning_rate": 1.2159806357128886e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3966 + }, + { + "completion_length": 1243.166748046875, + "epoch": 0.6047256097560976, + "grad_norm": 0.10132668865427465, + "kl": 0.0657958984375, + "learning_rate": 1.2151969456628353e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3967 + }, + { + "completion_length": 1923.0000610351562, + "epoch": 0.6048780487804878, + "grad_norm": 0.09662732141384814, + "kl": 0.0650634765625, + "learning_rate": 1.2144133362529974e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3968 + }, + { + "completion_length": 2000.3334350585938, + "epoch": 0.605030487804878, + "grad_norm": 0.08129832407825895, + "kl": 0.0712890625, + "learning_rate": 1.2136298077052497e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3969 + }, + { + "completion_length": 2753.3333740234375, + "epoch": 0.6051829268292683, + "grad_norm": 0.07980397349899875, + "kl": 0.068115234375, + "learning_rate": 1.2128463602414424e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3970 + }, + { + "completion_length": 1259.0000610351562, + "epoch": 0.6053353658536585, + "grad_norm": 0.0965397872452194, + "kl": 0.0635986328125, + "learning_rate": 1.2120629940834046e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3971 + }, + { + "completion_length": 3332.3333740234375, + "epoch": 0.6054878048780488, + "grad_norm": 0.0687955038328268, + "kl": 0.0562744140625, + "learning_rate": 1.2112797094529417e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3972 + }, + { + "completion_length": 1525.6666870117188, + "epoch": 0.605640243902439, + "grad_norm": 0.06599934357586762, + "kl": 0.0537109375, + "learning_rate": 1.210496506571835e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3973 + }, + { + "completion_length": 1531.5000610351562, + "epoch": 0.6057926829268293, + "grad_norm": 0.1161793634387897, + "kl": 0.0574951171875, + "learning_rate": 1.2097133856618442e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3974 + }, + { + "completion_length": 588.6666870117188, + "epoch": 0.6059451219512195, + "grad_norm": 0.1462302602270543, + "kl": 0.0791015625, + "learning_rate": 1.208930346944705e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3975 + }, + { + "completion_length": 1460.6666870117188, + "epoch": 0.6060975609756097, + "grad_norm": 0.11578024638553772, + "kl": 0.078857421875, + "learning_rate": 1.2081473906421298e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3976 + }, + { + "completion_length": 1388.8333740234375, + "epoch": 0.60625, + "grad_norm": 0.11905386331613192, + "kl": 0.05908203125, + "learning_rate": 1.2073645169758077e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3977 + }, + { + "completion_length": 1055.1666870117188, + "epoch": 0.6064024390243903, + "grad_norm": 2.6970367996972753, + "kl": 0.07861328125, + "learning_rate": 1.2065817261674046e-06, + "loss": 0.0031, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 3978 + }, + { + "completion_length": 1333.6666870117188, + "epoch": 0.6065548780487805, + "grad_norm": 0.08522099587889523, + "kl": 0.0682373046875, + "learning_rate": 1.2057990184385634e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3979 + }, + { + "completion_length": 3538.3333740234375, + "epoch": 0.6067073170731707, + "grad_norm": 0.08786928380972372, + "kl": 0.0531005859375, + "learning_rate": 1.205016394010902e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3980 + }, + { + "completion_length": 2675.3333740234375, + "epoch": 0.6068597560975609, + "grad_norm": 0.07143747276965706, + "kl": 0.056884765625, + "learning_rate": 1.2042338531060165e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3981 + }, + { + "completion_length": 2192.0000610351562, + "epoch": 0.6070121951219513, + "grad_norm": 0.08477135372300638, + "kl": 0.05615234375, + "learning_rate": 1.2034513959454778e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3982 + }, + { + "completion_length": 1253.6667175292969, + "epoch": 0.6071646341463415, + "grad_norm": 0.20584516064724662, + "kl": 0.100341796875, + "learning_rate": 1.2026690227508346e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3983 + }, + { + "completion_length": 3011.6666870117188, + "epoch": 0.6073170731707317, + "grad_norm": 1.2863437217391689, + "kl": 0.06201171875, + "learning_rate": 1.20188673374361e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3984 + }, + { + "completion_length": 1777.5000610351562, + "epoch": 0.6074695121951219, + "grad_norm": 1.4049930413137544, + "kl": 0.08349609375, + "learning_rate": 1.2011045291453047e-06, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3985 + }, + { + "completion_length": 1401.0000915527344, + "epoch": 0.6076219512195122, + "grad_norm": 0.10638403874858139, + "kl": 0.071044921875, + "learning_rate": 1.2003224091773959e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3986 + }, + { + "completion_length": 1844.1666870117188, + "epoch": 0.6077743902439025, + "grad_norm": 0.11499500121048108, + "kl": 0.08154296875, + "learning_rate": 1.199540374061334e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3987 + }, + { + "completion_length": 1613.1667175292969, + "epoch": 0.6079268292682927, + "grad_norm": 0.08745020408920709, + "kl": 0.0697021484375, + "learning_rate": 1.1987584240185492e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3988 + }, + { + "completion_length": 1699.1666870117188, + "epoch": 0.6080792682926829, + "grad_norm": 0.08343400451177166, + "kl": 0.048828125, + "learning_rate": 1.197976559270446e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3989 + }, + { + "completion_length": 2291.8333435058594, + "epoch": 0.6082317073170732, + "grad_norm": 0.09375454351239396, + "kl": 0.053955078125, + "learning_rate": 1.1971947800384028e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3990 + }, + { + "completion_length": 2543.0, + "epoch": 0.6083841463414634, + "grad_norm": 1.9638020260347657, + "kl": 0.0665283203125, + "learning_rate": 1.1964130865437769e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 3991 + }, + { + "completion_length": 1399.8333435058594, + "epoch": 0.6085365853658536, + "grad_norm": 0.0903211150623174, + "kl": 0.074951171875, + "learning_rate": 1.1956314790078998e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3992 + }, + { + "completion_length": 2403.0000915527344, + "epoch": 0.6086890243902439, + "grad_norm": 0.05842016338942718, + "kl": 0.0411376953125, + "learning_rate": 1.1948499576520784e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3993 + }, + { + "completion_length": 2411.0, + "epoch": 0.6088414634146342, + "grad_norm": 0.053485724430193886, + "kl": 0.0400390625, + "learning_rate": 1.194068522697596e-06, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3994 + }, + { + "completion_length": 2500.0, + "epoch": 0.6089939024390244, + "grad_norm": 0.05794107278994024, + "kl": 0.061279296875, + "learning_rate": 1.1932871743657106e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3995 + }, + { + "completion_length": 2113.5, + "epoch": 0.6091463414634146, + "grad_norm": 0.0723481098996075, + "kl": 0.0579833984375, + "learning_rate": 1.1925059128776567e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3996 + }, + { + "completion_length": 2743.666748046875, + "epoch": 0.6092987804878048, + "grad_norm": 0.0627791182340654, + "kl": 0.05224609375, + "learning_rate": 1.1917247384546434e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3997 + }, + { + "completion_length": 772.1666870117188, + "epoch": 0.6094512195121952, + "grad_norm": 0.09966433725218235, + "kl": 0.0469970703125, + "learning_rate": 1.1909436513178548e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 3998 + }, + { + "completion_length": 2076.5000915527344, + "epoch": 0.6096036585365854, + "grad_norm": 0.13715054625455808, + "kl": 0.068359375, + "learning_rate": 1.1901626516884519e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 3999 + }, + { + "completion_length": 3574.5001220703125, + "epoch": 0.6097560975609756, + "grad_norm": 0.03408722318692621, + "kl": 0.046630859375, + "learning_rate": 1.189381739787569e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4000 + }, + { + "completion_length": 2822.5, + "epoch": 0.6099085365853658, + "grad_norm": 0.08805335203622915, + "kl": 0.065673828125, + "learning_rate": 1.1886009158363173e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4001 + }, + { + "completion_length": 1926.5000610351562, + "epoch": 0.6100609756097561, + "grad_norm": 0.08355135721447253, + "kl": 0.056396484375, + "learning_rate": 1.1878201800557814e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4002 + }, + { + "completion_length": 3487.666748046875, + "epoch": 0.6102134146341464, + "grad_norm": 0.06225733792754516, + "kl": 0.0531005859375, + "learning_rate": 1.1870395326670222e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4003 + }, + { + "completion_length": 2373.5001220703125, + "epoch": 0.6103658536585366, + "grad_norm": 0.9807288905198768, + "kl": 0.0673828125, + "learning_rate": 1.1862589738910754e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4004 + }, + { + "completion_length": 2499.6666870117188, + "epoch": 0.6105182926829268, + "grad_norm": 0.07940532209208233, + "kl": 0.0599365234375, + "learning_rate": 1.1854785039489502e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4005 + }, + { + "completion_length": 3101.0001220703125, + "epoch": 0.6106707317073171, + "grad_norm": 0.9806412393483811, + "kl": 0.051025390625, + "learning_rate": 1.1846981230616334e-06, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4006 + }, + { + "completion_length": 1654.8333435058594, + "epoch": 0.6108231707317073, + "grad_norm": 0.0924208753989707, + "kl": 0.0518798828125, + "learning_rate": 1.1839178314500845e-06, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4007 + }, + { + "completion_length": 2899.33349609375, + "epoch": 0.6109756097560975, + "grad_norm": 0.04073039570438592, + "kl": 0.05078125, + "learning_rate": 1.1831376293352378e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4008 + }, + { + "completion_length": 3678.33349609375, + "epoch": 0.6111280487804878, + "grad_norm": 0.041341999095760196, + "kl": 0.0467529296875, + "learning_rate": 1.182357516938003e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4009 + }, + { + "completion_length": 2996.5, + "epoch": 0.6112804878048781, + "grad_norm": 0.08421987103447501, + "kl": 0.062255859375, + "learning_rate": 1.181577494479264e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4010 + }, + { + "completion_length": 1929.8334350585938, + "epoch": 0.6114329268292683, + "grad_norm": 0.06479940211853868, + "kl": 0.0616455078125, + "learning_rate": 1.1807975621798793e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4011 + }, + { + "completion_length": 3300.5, + "epoch": 0.6115853658536585, + "grad_norm": 0.0553419195252943, + "kl": 0.05224609375, + "learning_rate": 1.180017720260682e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4012 + }, + { + "completion_length": 1901.1667175292969, + "epoch": 0.6117378048780487, + "grad_norm": 1.733552454933503, + "kl": 0.0765380859375, + "learning_rate": 1.1792379689424793e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4013 + }, + { + "completion_length": 2169.8333740234375, + "epoch": 0.6118902439024391, + "grad_norm": 0.07695357706546893, + "kl": 0.0545654296875, + "learning_rate": 1.1784583084460532e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4014 + }, + { + "completion_length": 1149.8333740234375, + "epoch": 0.6120426829268293, + "grad_norm": 0.09558830300150067, + "kl": 0.061767578125, + "learning_rate": 1.1776787389921593e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4015 + }, + { + "completion_length": 621.1666717529297, + "epoch": 0.6121951219512195, + "grad_norm": 0.24623494065847046, + "kl": 0.063720703125, + "learning_rate": 1.176899260801528e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4016 + }, + { + "completion_length": 3543.166748046875, + "epoch": 0.6123475609756097, + "grad_norm": 0.03924013108967147, + "kl": 0.0469970703125, + "learning_rate": 1.1761198740948645e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4017 + }, + { + "completion_length": 3129.8333740234375, + "epoch": 0.6125, + "grad_norm": 0.06358681541774307, + "kl": 0.05029296875, + "learning_rate": 1.1753405790928457e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4018 + }, + { + "completion_length": 2512.0, + "epoch": 0.6126524390243903, + "grad_norm": 1.604138941117521, + "kl": 0.06787109375, + "learning_rate": 1.1745613760161258e-06, + "loss": 0.0027, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4019 + }, + { + "completion_length": 2028.0001220703125, + "epoch": 0.6128048780487805, + "grad_norm": 0.060440236904943684, + "kl": 0.061767578125, + "learning_rate": 1.1737822650853301e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4020 + }, + { + "completion_length": 3199.5001220703125, + "epoch": 0.6129573170731707, + "grad_norm": 0.042564954186249136, + "kl": 0.0477294921875, + "learning_rate": 1.17300324652106e-06, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4021 + }, + { + "completion_length": 863.0, + "epoch": 0.613109756097561, + "grad_norm": 2.664795060008636, + "kl": 0.0902099609375, + "learning_rate": 1.1722243205438889e-06, + "loss": 0.0036, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4022 + }, + { + "completion_length": 1669.6667175292969, + "epoch": 0.6132621951219512, + "grad_norm": 0.09115538825757699, + "kl": 0.067138671875, + "learning_rate": 1.1714454873743651e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4023 + }, + { + "completion_length": 1909.3333740234375, + "epoch": 0.6134146341463415, + "grad_norm": 1.573384660291711, + "kl": 0.052734375, + "learning_rate": 1.1706667472330101e-06, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4024 + }, + { + "completion_length": 898.6666870117188, + "epoch": 0.6135670731707317, + "grad_norm": 1.8845525751192438, + "kl": 0.082275390625, + "learning_rate": 1.1698881003403205e-06, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4025 + }, + { + "completion_length": 1539.3333740234375, + "epoch": 0.613719512195122, + "grad_norm": 0.14465255490514178, + "kl": 0.0626220703125, + "learning_rate": 1.1691095469167642e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4026 + }, + { + "completion_length": 2095.5000915527344, + "epoch": 0.6138719512195122, + "grad_norm": 0.07589558962885785, + "kl": 0.0565185546875, + "learning_rate": 1.1683310871827839e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4027 + }, + { + "completion_length": 3174.8333740234375, + "epoch": 0.6140243902439024, + "grad_norm": 1.274638702730334, + "kl": 0.054931640625, + "learning_rate": 1.1675527213587963e-06, + "loss": 0.0022, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4028 + }, + { + "completion_length": 2639.3333740234375, + "epoch": 0.6141768292682926, + "grad_norm": 0.110482782440555, + "kl": 0.060546875, + "learning_rate": 1.1667744496651902e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4029 + }, + { + "completion_length": 1936.5, + "epoch": 0.614329268292683, + "grad_norm": 0.8818557055312921, + "kl": 0.057861328125, + "learning_rate": 1.165996272322329e-06, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4030 + }, + { + "completion_length": 1872.666748046875, + "epoch": 0.6144817073170732, + "grad_norm": 1.122229752108808, + "kl": 0.0657958984375, + "learning_rate": 1.1652181895505478e-06, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4031 + }, + { + "completion_length": 2530.8333740234375, + "epoch": 0.6146341463414634, + "grad_norm": 0.07396348120695131, + "kl": 0.0443115234375, + "learning_rate": 1.1644402015701568e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4032 + }, + { + "completion_length": 954.5, + "epoch": 0.6147865853658536, + "grad_norm": 0.11869301516897668, + "kl": 0.0830078125, + "learning_rate": 1.1636623086014381e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4033 + }, + { + "completion_length": 1883.5, + "epoch": 0.614939024390244, + "grad_norm": 0.12165763240141574, + "kl": 0.06494140625, + "learning_rate": 1.1628845108646468e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4034 + }, + { + "completion_length": 1468.5, + "epoch": 0.6150914634146342, + "grad_norm": 1.3675442746775563, + "kl": 0.1025390625, + "learning_rate": 1.1621068085800128e-06, + "loss": 0.0041, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4035 + }, + { + "completion_length": 1776.1667175292969, + "epoch": 0.6152439024390244, + "grad_norm": 0.09004427541825699, + "kl": 0.044677734375, + "learning_rate": 1.1613292019677364e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4036 + }, + { + "completion_length": 2007.0000610351562, + "epoch": 0.6153963414634146, + "grad_norm": 0.16296443495770818, + "kl": 0.076416015625, + "learning_rate": 1.1605516912479928e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4037 + }, + { + "completion_length": 1537.8333740234375, + "epoch": 0.6155487804878049, + "grad_norm": 2.348303795122743, + "kl": 0.0986328125, + "learning_rate": 1.1597742766409286e-06, + "loss": 0.0039, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4038 + }, + { + "completion_length": 2774.166748046875, + "epoch": 0.6157012195121951, + "grad_norm": 0.07710551992497065, + "kl": 0.0645751953125, + "learning_rate": 1.1589969583666651e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4039 + }, + { + "completion_length": 774.3333587646484, + "epoch": 0.6158536585365854, + "grad_norm": 0.15583601838113986, + "kl": 0.107177734375, + "learning_rate": 1.158219736645294e-06, + "loss": 0.0043, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4040 + }, + { + "completion_length": 1356.5, + "epoch": 0.6160060975609756, + "grad_norm": 0.11187120459953892, + "kl": 0.0751953125, + "learning_rate": 1.1574426116968811e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4041 + }, + { + "completion_length": 2286.5, + "epoch": 0.6161585365853659, + "grad_norm": 1.7329524487616532, + "kl": 0.07861328125, + "learning_rate": 1.1566655837414645e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4042 + }, + { + "completion_length": 1676.166748046875, + "epoch": 0.6163109756097561, + "grad_norm": 0.17582818945851222, + "kl": 0.076416015625, + "learning_rate": 1.1558886529990554e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4043 + }, + { + "completion_length": 529.0, + "epoch": 0.6164634146341463, + "grad_norm": 2.0704048756420574, + "kl": 0.10498046875, + "learning_rate": 1.1551118196896364e-06, + "loss": 0.0042, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4044 + }, + { + "completion_length": 1441.0000610351562, + "epoch": 0.6166158536585366, + "grad_norm": 0.19249627943712297, + "kl": 0.07275390625, + "learning_rate": 1.1543350840331634e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4045 + }, + { + "completion_length": 1098.3333740234375, + "epoch": 0.6167682926829269, + "grad_norm": 0.10178751296614462, + "kl": 0.081787109375, + "learning_rate": 1.1535584462495635e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4046 + }, + { + "completion_length": 2360.8333435058594, + "epoch": 0.6169207317073171, + "grad_norm": 0.06845427523867313, + "kl": 0.0478515625, + "learning_rate": 1.1527819065587375e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4047 + }, + { + "completion_length": 1904.3333740234375, + "epoch": 0.6170731707317073, + "grad_norm": 0.08761671319512374, + "kl": 0.0714111328125, + "learning_rate": 1.152005465180558e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4048 + }, + { + "completion_length": 2099.8334350585938, + "epoch": 0.6172256097560975, + "grad_norm": 1.923162647395587, + "kl": 0.071044921875, + "learning_rate": 1.1512291223348688e-06, + "loss": 0.0028, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 4049 + }, + { + "completion_length": 3607.0, + "epoch": 0.6173780487804879, + "grad_norm": 0.034507176568234654, + "kl": 0.0469970703125, + "learning_rate": 1.1504528782414872e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4050 + }, + { + "completion_length": 1680.5000915527344, + "epoch": 0.6175304878048781, + "grad_norm": 0.1601438083651295, + "kl": 0.0672607421875, + "learning_rate": 1.1496767331202016e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4051 + }, + { + "completion_length": 2447.5, + "epoch": 0.6176829268292683, + "grad_norm": 0.08723124607526686, + "kl": 0.0609130859375, + "learning_rate": 1.1489006871907728e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4052 + }, + { + "completion_length": 1987.5000610351562, + "epoch": 0.6178353658536585, + "grad_norm": 0.08996846099112935, + "kl": 0.06591796875, + "learning_rate": 1.148124740672933e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4053 + }, + { + "completion_length": 2026.5001220703125, + "epoch": 0.6179878048780488, + "grad_norm": 0.1902437037525233, + "kl": 0.08203125, + "learning_rate": 1.1473488937863869e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4054 + }, + { + "completion_length": 2543.0, + "epoch": 0.618140243902439, + "grad_norm": 0.06703587643711695, + "kl": 0.0526123046875, + "learning_rate": 1.1465731467508112e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4055 + }, + { + "completion_length": 1905.8333435058594, + "epoch": 0.6182926829268293, + "grad_norm": 0.18610639342184826, + "kl": 0.075439453125, + "learning_rate": 1.145797499785853e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4056 + }, + { + "completion_length": 3323.0, + "epoch": 0.6184451219512195, + "grad_norm": 0.0730886063499632, + "kl": 0.044677734375, + "learning_rate": 1.1450219531111328e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4057 + }, + { + "completion_length": 2325.8333435058594, + "epoch": 0.6185975609756098, + "grad_norm": 0.08574439951673092, + "kl": 0.0589599609375, + "learning_rate": 1.1442465069462408e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4058 + }, + { + "completion_length": 2427.5, + "epoch": 0.61875, + "grad_norm": 0.09825070881540146, + "kl": 0.0570068359375, + "learning_rate": 1.1434711615107406e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4059 + }, + { + "completion_length": 2804.5, + "epoch": 0.6189024390243902, + "grad_norm": 0.09129176232563661, + "kl": 0.0576171875, + "learning_rate": 1.1426959170241663e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4060 + }, + { + "completion_length": 3109.3333740234375, + "epoch": 0.6190548780487805, + "grad_norm": 0.042453581195370176, + "kl": 0.0435791015625, + "learning_rate": 1.1419207737060228e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4061 + }, + { + "completion_length": 3469.8333740234375, + "epoch": 0.6192073170731708, + "grad_norm": 0.07637445609606426, + "kl": 0.0552978515625, + "learning_rate": 1.141145731775788e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4062 + }, + { + "completion_length": 3164.0, + "epoch": 0.619359756097561, + "grad_norm": 0.0573507407123113, + "kl": 0.0557861328125, + "learning_rate": 1.1403707914529106e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4063 + }, + { + "completion_length": 3343.5001220703125, + "epoch": 0.6195121951219512, + "grad_norm": 0.06291491784834606, + "kl": 0.0625, + "learning_rate": 1.1395959529568088e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4064 + }, + { + "completion_length": 899.1667175292969, + "epoch": 0.6196646341463414, + "grad_norm": 0.10598959673211017, + "kl": 0.07568359375, + "learning_rate": 1.1388212165068741e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4065 + }, + { + "completion_length": 1105.3333740234375, + "epoch": 0.6198170731707318, + "grad_norm": 0.1716015007167223, + "kl": 0.0709228515625, + "learning_rate": 1.1380465823224688e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4066 + }, + { + "completion_length": 2027.6667175292969, + "epoch": 0.619969512195122, + "grad_norm": 0.09188236422792644, + "kl": 0.071533203125, + "learning_rate": 1.1372720506229247e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4067 + }, + { + "completion_length": 1759.6667175292969, + "epoch": 0.6201219512195122, + "grad_norm": 0.0933681163098677, + "kl": 0.0557861328125, + "learning_rate": 1.1364976216275462e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4068 + }, + { + "completion_length": 3130.666748046875, + "epoch": 0.6202743902439024, + "grad_norm": 1.2013893204758372, + "kl": 0.05908203125, + "learning_rate": 1.1357232955556078e-06, + "loss": 0.0024, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4069 + }, + { + "completion_length": 2430.166748046875, + "epoch": 0.6204268292682927, + "grad_norm": 0.38052485970199645, + "kl": 0.07080078125, + "learning_rate": 1.1349490726263555e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4070 + }, + { + "completion_length": 1990.3334350585938, + "epoch": 0.620579268292683, + "grad_norm": 0.08347100617396863, + "kl": 0.05859375, + "learning_rate": 1.1341749530590053e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4071 + }, + { + "completion_length": 2968.666748046875, + "epoch": 0.6207317073170732, + "grad_norm": 0.07822770185239855, + "kl": 0.05908203125, + "learning_rate": 1.1334009370727446e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4072 + }, + { + "completion_length": 960.0000305175781, + "epoch": 0.6208841463414634, + "grad_norm": 0.17458890319983614, + "kl": 0.072509765625, + "learning_rate": 1.1326270248867312e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4073 + }, + { + "completion_length": 1779.8333740234375, + "epoch": 0.6210365853658537, + "grad_norm": 0.984454577627025, + "kl": 0.063232421875, + "learning_rate": 1.1318532167200937e-06, + "loss": 0.0025, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 4074 + }, + { + "completion_length": 1545.666748046875, + "epoch": 0.6211890243902439, + "grad_norm": 0.15084694050350841, + "kl": 0.0859375, + "learning_rate": 1.1310795127919308e-06, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4075 + }, + { + "completion_length": 2052.3334350585938, + "epoch": 0.6213414634146341, + "grad_norm": 0.0898091690105366, + "kl": 0.0460205078125, + "learning_rate": 1.1303059133213115e-06, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4076 + }, + { + "completion_length": 4096.0, + "epoch": 0.6214939024390244, + "grad_norm": 0.03563709614445438, + "kl": 0.050048828125, + "learning_rate": 1.1295324185272768e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4077 + }, + { + "completion_length": 3288.5, + "epoch": 0.6216463414634147, + "grad_norm": 0.0743661244084289, + "kl": 0.051025390625, + "learning_rate": 1.128759028628836e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4078 + }, + { + "completion_length": 2046.666748046875, + "epoch": 0.6217987804878049, + "grad_norm": 0.11508379520657813, + "kl": 0.072021484375, + "learning_rate": 1.1279857438449695e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4079 + }, + { + "completion_length": 978.6666870117188, + "epoch": 0.6219512195121951, + "grad_norm": 0.14055811069006016, + "kl": 0.08154296875, + "learning_rate": 1.127212564394629e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4080 + }, + { + "completion_length": 3132.3333740234375, + "epoch": 0.6221036585365853, + "grad_norm": 0.08644647818272139, + "kl": 0.0589599609375, + "learning_rate": 1.1264394904967355e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4081 + }, + { + "completion_length": 1527.1666870117188, + "epoch": 0.6222560975609757, + "grad_norm": 0.09250452308720755, + "kl": 0.0614013671875, + "learning_rate": 1.1256665223701792e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4082 + }, + { + "completion_length": 1957.5001220703125, + "epoch": 0.6224085365853659, + "grad_norm": 0.1454972905322493, + "kl": 0.080322265625, + "learning_rate": 1.124893660233822e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4083 + }, + { + "completion_length": 2209.666748046875, + "epoch": 0.6225609756097561, + "grad_norm": 0.07705970786207035, + "kl": 0.070068359375, + "learning_rate": 1.1241209043064944e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4084 + }, + { + "completion_length": 939.0, + "epoch": 0.6227134146341463, + "grad_norm": 0.13712831415664128, + "kl": 0.109375, + "learning_rate": 1.1233482548069975e-06, + "loss": 0.0044, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4085 + }, + { + "completion_length": 1708.8333740234375, + "epoch": 0.6228658536585366, + "grad_norm": 1.1266423829160004, + "kl": 0.07958984375, + "learning_rate": 1.1225757119541027e-06, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4086 + }, + { + "completion_length": 2026.0000915527344, + "epoch": 0.6230182926829269, + "grad_norm": 0.9290992681005956, + "kl": 0.068115234375, + "learning_rate": 1.1218032759665505e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4087 + }, + { + "completion_length": 1863.5, + "epoch": 0.6231707317073171, + "grad_norm": 0.10859147777834247, + "kl": 0.081298828125, + "learning_rate": 1.1210309470630509e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4088 + }, + { + "completion_length": 3196.8333740234375, + "epoch": 0.6233231707317073, + "grad_norm": 0.05718160449871586, + "kl": 0.07177734375, + "learning_rate": 1.1202587254622845e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4089 + }, + { + "completion_length": 2040.0001220703125, + "epoch": 0.6234756097560976, + "grad_norm": 0.09344554228246825, + "kl": 0.0579833984375, + "learning_rate": 1.1194866113829009e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4090 + }, + { + "completion_length": 2640.3333740234375, + "epoch": 0.6236280487804878, + "grad_norm": 0.09128836791560374, + "kl": 0.0626220703125, + "learning_rate": 1.1187146050435196e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4091 + }, + { + "completion_length": 1849.8333740234375, + "epoch": 0.623780487804878, + "grad_norm": 1.7829828061916277, + "kl": 0.0927734375, + "learning_rate": 1.1179427066627292e-06, + "loss": 0.0037, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4092 + }, + { + "completion_length": 1670.1666870117188, + "epoch": 0.6239329268292683, + "grad_norm": 0.24735667228568559, + "kl": 0.1005859375, + "learning_rate": 1.117170916459088e-06, + "loss": 0.004, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4093 + }, + { + "completion_length": 1305.8333435058594, + "epoch": 0.6240853658536586, + "grad_norm": 0.11421550613539914, + "kl": 0.077880859375, + "learning_rate": 1.1163992346511233e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4094 + }, + { + "completion_length": 2002.666748046875, + "epoch": 0.6242378048780488, + "grad_norm": 0.0736165859569011, + "kl": 0.0633544921875, + "learning_rate": 1.1156276614573328e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4095 + }, + { + "completion_length": 1259.1666870117188, + "epoch": 0.624390243902439, + "grad_norm": 0.12226860622782063, + "kl": 0.0771484375, + "learning_rate": 1.1148561970961818e-06, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4096 + }, + { + "completion_length": 2736.666748046875, + "epoch": 0.6245426829268292, + "grad_norm": 2.062681335773486, + "kl": 0.07958984375, + "learning_rate": 1.1140848417861055e-06, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4097 + }, + { + "completion_length": 2389.1666870117188, + "epoch": 0.6246951219512196, + "grad_norm": 0.13243310293713906, + "kl": 0.07421875, + "learning_rate": 1.1133135957455091e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4098 + }, + { + "completion_length": 3103.5, + "epoch": 0.6248475609756098, + "grad_norm": 0.07002704976403201, + "kl": 0.054443359375, + "learning_rate": 1.1125424591927662e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4099 + }, + { + "completion_length": 1194.0000610351562, + "epoch": 0.625, + "grad_norm": 0.1370126674188365, + "kl": 0.08203125, + "learning_rate": 1.1117714323462188e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4100 + }, + { + "completion_length": 2093.166748046875, + "epoch": 0.6251524390243902, + "grad_norm": 0.1404898341636917, + "kl": 0.084228515625, + "learning_rate": 1.111000515424179e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4101 + }, + { + "completion_length": 1839.3334350585938, + "epoch": 0.6253048780487804, + "grad_norm": 2.4918591542206108, + "kl": 0.0966796875, + "learning_rate": 1.110229708644926e-06, + "loss": 0.0039, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 4102 + }, + { + "completion_length": 1631.5000610351562, + "epoch": 0.6254573170731708, + "grad_norm": 0.13463382374250915, + "kl": 0.0477294921875, + "learning_rate": 1.1094590122267097e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4103 + }, + { + "completion_length": 1113.5, + "epoch": 0.625609756097561, + "grad_norm": 0.13640672708458004, + "kl": 0.071044921875, + "learning_rate": 1.1086884263877486e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4104 + }, + { + "completion_length": 2456.3334350585938, + "epoch": 0.6257621951219512, + "grad_norm": 1.7338985963822706, + "kl": 0.0782470703125, + "learning_rate": 1.1079179513462281e-06, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4105 + }, + { + "completion_length": 2629.3333740234375, + "epoch": 0.6259146341463414, + "grad_norm": 0.08083598202546868, + "kl": 0.0673828125, + "learning_rate": 1.1071475873203043e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4106 + }, + { + "completion_length": 2365.0000915527344, + "epoch": 0.6260670731707317, + "grad_norm": 0.08536586230252473, + "kl": 0.052978515625, + "learning_rate": 1.1063773345281003e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4107 + }, + { + "completion_length": 755.5, + "epoch": 0.626219512195122, + "grad_norm": 0.2168457014743105, + "kl": 0.099365234375, + "learning_rate": 1.105607193187709e-06, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4108 + }, + { + "completion_length": 3478.0, + "epoch": 0.6263719512195122, + "grad_norm": 0.05003466633796149, + "kl": 0.049560546875, + "learning_rate": 1.1048371635171905e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4109 + }, + { + "completion_length": 2369.5, + "epoch": 0.6265243902439024, + "grad_norm": 0.10758042890001153, + "kl": 0.0653076171875, + "learning_rate": 1.1040672457345745e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4110 + }, + { + "completion_length": 789.8333435058594, + "epoch": 0.6266768292682927, + "grad_norm": 0.11527355875081029, + "kl": 0.0667724609375, + "learning_rate": 1.1032974400578584e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4111 + }, + { + "completion_length": 1622.8333740234375, + "epoch": 0.6268292682926829, + "grad_norm": 0.2634867126547813, + "kl": 0.088623046875, + "learning_rate": 1.1025277467050079e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4112 + }, + { + "completion_length": 974.3333740234375, + "epoch": 0.6269817073170731, + "grad_norm": 0.13341360856388615, + "kl": 0.086181640625, + "learning_rate": 1.1017581658939564e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4113 + }, + { + "completion_length": 1754.3333740234375, + "epoch": 0.6271341463414634, + "grad_norm": 0.11270053801210525, + "kl": 0.071044921875, + "learning_rate": 1.1009886978426062e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4114 + }, + { + "completion_length": 1106.6666870117188, + "epoch": 0.6272865853658537, + "grad_norm": 1.9933963706141176, + "kl": 0.0833740234375, + "learning_rate": 1.1002193427688278e-06, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4115 + }, + { + "completion_length": 1745.0000915527344, + "epoch": 0.6274390243902439, + "grad_norm": 0.097554283304486, + "kl": 0.0528564453125, + "learning_rate": 1.0994501008904578e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4116 + }, + { + "completion_length": 911.1666870117188, + "epoch": 0.6275914634146341, + "grad_norm": 0.22112510160605084, + "kl": 0.068603515625, + "learning_rate": 1.0986809724253043e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4117 + }, + { + "completion_length": 2230.3333740234375, + "epoch": 0.6277439024390243, + "grad_norm": 0.09144708219941076, + "kl": 0.05810546875, + "learning_rate": 1.0979119575911404e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4118 + }, + { + "completion_length": 1197.0000610351562, + "epoch": 0.6278963414634147, + "grad_norm": 0.10070272906387887, + "kl": 0.0606689453125, + "learning_rate": 1.097143056605708e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4119 + }, + { + "completion_length": 2539.0, + "epoch": 0.6280487804878049, + "grad_norm": 0.0949765322564294, + "kl": 0.0693359375, + "learning_rate": 1.0963742696867162e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4120 + }, + { + "completion_length": 1551.5000610351562, + "epoch": 0.6282012195121951, + "grad_norm": 0.09674013831538716, + "kl": 0.10302734375, + "learning_rate": 1.0956055970518422e-06, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4121 + }, + { + "completion_length": 1932.3333435058594, + "epoch": 0.6283536585365853, + "grad_norm": 0.0904747852587459, + "kl": 0.0582275390625, + "learning_rate": 1.094837038918732e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4122 + }, + { + "completion_length": 2117.5001220703125, + "epoch": 0.6285060975609756, + "grad_norm": 0.07965217050836884, + "kl": 0.0574951171875, + "learning_rate": 1.0940685955049965e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4123 + }, + { + "completion_length": 2604.5, + "epoch": 0.6286585365853659, + "grad_norm": 0.46462657912818073, + "kl": 0.0675048828125, + "learning_rate": 1.093300267028217e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4124 + }, + { + "completion_length": 2010.166748046875, + "epoch": 0.6288109756097561, + "grad_norm": 0.12392999777687468, + "kl": 0.082275390625, + "learning_rate": 1.09253205370594e-06, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4125 + }, + { + "completion_length": 2105.5, + "epoch": 0.6289634146341463, + "grad_norm": 0.1124832589928768, + "kl": 0.073974609375, + "learning_rate": 1.091763955755681e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4126 + }, + { + "completion_length": 3791.5, + "epoch": 0.6291158536585366, + "grad_norm": 0.05703777851481327, + "kl": 0.0699462890625, + "learning_rate": 1.0909959733949214e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4127 + }, + { + "completion_length": 3058.166748046875, + "epoch": 0.6292682926829268, + "grad_norm": 1.1507006446154666, + "kl": 0.06201171875, + "learning_rate": 1.0902281068411114e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4128 + }, + { + "completion_length": 1613.8334350585938, + "epoch": 0.629420731707317, + "grad_norm": 4.220336939720507, + "kl": 0.0859375, + "learning_rate": 1.0894603563116673e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4129 + }, + { + "completion_length": 1117.5, + "epoch": 0.6295731707317073, + "grad_norm": 0.11966708585104861, + "kl": 0.072509765625, + "learning_rate": 1.0886927220239728e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4130 + }, + { + "completion_length": 2903.0, + "epoch": 0.6297256097560976, + "grad_norm": 0.07548559244551967, + "kl": 0.0595703125, + "learning_rate": 1.0879252041953793e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4131 + }, + { + "completion_length": 2428.1666870117188, + "epoch": 0.6298780487804878, + "grad_norm": 0.10764124083478437, + "kl": 0.0540771484375, + "learning_rate": 1.0871578030432038e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4132 + }, + { + "completion_length": 2357.3334350585938, + "epoch": 0.630030487804878, + "grad_norm": 0.07816181422796567, + "kl": 0.0631103515625, + "learning_rate": 1.086390518784732e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4133 + }, + { + "completion_length": 1587.6666870117188, + "epoch": 0.6301829268292682, + "grad_norm": 0.0705662336117904, + "kl": 0.0618896484375, + "learning_rate": 1.0856233516372146e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4134 + }, + { + "completion_length": 2914.0001220703125, + "epoch": 0.6303353658536586, + "grad_norm": 0.11922368835568689, + "kl": 0.07080078125, + "learning_rate": 1.084856301817872e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4135 + }, + { + "completion_length": 1930.5, + "epoch": 0.6304878048780488, + "grad_norm": 0.08161821980123407, + "kl": 0.0615234375, + "learning_rate": 1.084089369543888e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4136 + }, + { + "completion_length": 2280.8333740234375, + "epoch": 0.630640243902439, + "grad_norm": 0.07018686261479198, + "kl": 0.064697265625, + "learning_rate": 1.0833225550324158e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4137 + }, + { + "completion_length": 1127.0, + "epoch": 0.6307926829268292, + "grad_norm": 2.891030572828972, + "kl": 0.072509765625, + "learning_rate": 1.082555858500573e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4138 + }, + { + "completion_length": 807.3333740234375, + "epoch": 0.6309451219512195, + "grad_norm": 2.6889417976366357, + "kl": 0.104736328125, + "learning_rate": 1.0817892801654461e-06, + "loss": 0.0042, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4139 + }, + { + "completion_length": 2952.0, + "epoch": 0.6310975609756098, + "grad_norm": 0.06293620221968985, + "kl": 0.0579833984375, + "learning_rate": 1.0810228202440862e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4140 + }, + { + "completion_length": 1787.3333740234375, + "epoch": 0.63125, + "grad_norm": 0.1636124892571022, + "kl": 0.072509765625, + "learning_rate": 1.080256478953512e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4141 + }, + { + "completion_length": 2267.3333740234375, + "epoch": 0.6314024390243902, + "grad_norm": 0.07116269362209023, + "kl": 0.05078125, + "learning_rate": 1.0794902565107084e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4142 + }, + { + "completion_length": 2478.3334350585938, + "epoch": 0.6315548780487805, + "grad_norm": 0.9040223102534083, + "kl": 0.07568359375, + "learning_rate": 1.078724153132626e-06, + "loss": 0.003, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4143 + }, + { + "completion_length": 1086.5, + "epoch": 0.6317073170731707, + "grad_norm": 0.14523233989127693, + "kl": 0.0814208984375, + "learning_rate": 1.077958169036183e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4144 + }, + { + "completion_length": 1446.3333740234375, + "epoch": 0.631859756097561, + "grad_norm": 0.14631800006816345, + "kl": 0.07958984375, + "learning_rate": 1.0771923044382621e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4145 + }, + { + "completion_length": 1095.3333740234375, + "epoch": 0.6320121951219512, + "grad_norm": 0.10244939148550289, + "kl": 0.07177734375, + "learning_rate": 1.076426559555714e-06, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4146 + }, + { + "completion_length": 2213.8333740234375, + "epoch": 0.6321646341463415, + "grad_norm": 1.1610458326667343, + "kl": 0.0628662109375, + "learning_rate": 1.0756609346053542e-06, + "loss": 0.0025, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4147 + }, + { + "completion_length": 2020.6666870117188, + "epoch": 0.6323170731707317, + "grad_norm": 2.242104826379626, + "kl": 0.073974609375, + "learning_rate": 1.0748954298039644e-06, + "loss": 0.003, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4148 + }, + { + "completion_length": 1890.0001220703125, + "epoch": 0.6324695121951219, + "grad_norm": 0.08955551105955918, + "kl": 0.068115234375, + "learning_rate": 1.0741300453682932e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4149 + }, + { + "completion_length": 1760.1666870117188, + "epoch": 0.6326219512195121, + "grad_norm": 0.22841014930059572, + "kl": 0.091064453125, + "learning_rate": 1.0733647815150536e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4150 + }, + { + "completion_length": 1013.1667175292969, + "epoch": 0.6327743902439025, + "grad_norm": 0.1524669800591918, + "kl": 0.10693359375, + "learning_rate": 1.0725996384609262e-06, + "loss": 0.0043, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4151 + }, + { + "completion_length": 935.8333435058594, + "epoch": 0.6329268292682927, + "grad_norm": 0.20923772724001985, + "kl": 0.07421875, + "learning_rate": 1.0718346164225556e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4152 + }, + { + "completion_length": 1678.8333435058594, + "epoch": 0.6330792682926829, + "grad_norm": 0.12224697413331878, + "kl": 0.0888671875, + "learning_rate": 1.071069715616553e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4153 + }, + { + "completion_length": 1860.5000610351562, + "epoch": 0.6332317073170731, + "grad_norm": 0.09621270591460535, + "kl": 0.0673828125, + "learning_rate": 1.0703049362594967e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4154 + }, + { + "completion_length": 787.0, + "epoch": 0.6333841463414634, + "grad_norm": 0.1086492799605468, + "kl": 0.105224609375, + "learning_rate": 1.069540278567928e-06, + "loss": 0.0042, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4155 + }, + { + "completion_length": 1255.1667175292969, + "epoch": 0.6335365853658537, + "grad_norm": 0.11467986169944969, + "kl": 0.090576171875, + "learning_rate": 1.0687757427583553e-06, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4156 + }, + { + "completion_length": 843.5000305175781, + "epoch": 0.6336890243902439, + "grad_norm": 0.10718533716109603, + "kl": 0.066650390625, + "learning_rate": 1.0680113290472525e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4157 + }, + { + "completion_length": 2552.0001220703125, + "epoch": 0.6338414634146341, + "grad_norm": 1.1660250763275526, + "kl": 0.0628662109375, + "learning_rate": 1.0672470376510577e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4158 + }, + { + "completion_length": 720.3333435058594, + "epoch": 0.6339939024390244, + "grad_norm": 0.17517492827999342, + "kl": 0.080078125, + "learning_rate": 1.066482868786176e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4159 + }, + { + "completion_length": 1287.0000610351562, + "epoch": 0.6341463414634146, + "grad_norm": 0.10947147576296658, + "kl": 0.07421875, + "learning_rate": 1.0657188226689772e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4160 + }, + { + "completion_length": 850.8333435058594, + "epoch": 0.6342987804878049, + "grad_norm": 0.1295941873622637, + "kl": 0.07666015625, + "learning_rate": 1.0649548995157956e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4161 + }, + { + "completion_length": 2057.0000610351562, + "epoch": 0.6344512195121951, + "grad_norm": 0.08763200222045534, + "kl": 0.06298828125, + "learning_rate": 1.0641910995429317e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4162 + }, + { + "completion_length": 994.0, + "epoch": 0.6346036585365854, + "grad_norm": 0.1290037117000964, + "kl": 0.08837890625, + "learning_rate": 1.0634274229666507e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4163 + }, + { + "completion_length": 2115.3333740234375, + "epoch": 0.6347560975609756, + "grad_norm": 0.10864821360145048, + "kl": 0.072509765625, + "learning_rate": 1.0626638700031825e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4164 + }, + { + "completion_length": 1093.8333435058594, + "epoch": 0.6349085365853658, + "grad_norm": 0.131808482166016, + "kl": 0.0689697265625, + "learning_rate": 1.0619004408687228e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4165 + }, + { + "completion_length": 2115.3334350585938, + "epoch": 0.635060975609756, + "grad_norm": 0.1367313931831311, + "kl": 0.073974609375, + "learning_rate": 1.0611371357794317e-06, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4166 + }, + { + "completion_length": 2509.3333740234375, + "epoch": 0.6352134146341464, + "grad_norm": 0.06269212256873449, + "kl": 0.0616455078125, + "learning_rate": 1.0603739549514342e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4167 + }, + { + "completion_length": 950.0000305175781, + "epoch": 0.6353658536585366, + "grad_norm": 0.09615000110623043, + "kl": 0.0631103515625, + "learning_rate": 1.0596108986008203e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4168 + }, + { + "completion_length": 981.0, + "epoch": 0.6355182926829268, + "grad_norm": 0.1763866478333921, + "kl": 0.073974609375, + "learning_rate": 1.058847966943645e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4169 + }, + { + "completion_length": 2416.5, + "epoch": 0.635670731707317, + "grad_norm": 0.14010277026191276, + "kl": 0.0679931640625, + "learning_rate": 1.0580851601959268e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4170 + }, + { + "completion_length": 2975.166748046875, + "epoch": 0.6358231707317074, + "grad_norm": 0.8927230052667448, + "kl": 0.06103515625, + "learning_rate": 1.05732247857365e-06, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4171 + }, + { + "completion_length": 1115.0000305175781, + "epoch": 0.6359756097560976, + "grad_norm": 0.10182743644648437, + "kl": 0.0623779296875, + "learning_rate": 1.0565599222927637e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4172 + }, + { + "completion_length": 704.6666870117188, + "epoch": 0.6361280487804878, + "grad_norm": 0.10607885918691973, + "kl": 0.0791015625, + "learning_rate": 1.0557974915691808e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4173 + }, + { + "completion_length": 1687.5000610351562, + "epoch": 0.636280487804878, + "grad_norm": 0.09273438632475152, + "kl": 0.0606689453125, + "learning_rate": 1.0550351866187783e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4174 + }, + { + "completion_length": 1617.5000915527344, + "epoch": 0.6364329268292683, + "grad_norm": 0.08715257205215039, + "kl": 0.050048828125, + "learning_rate": 1.054273007657399e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4175 + }, + { + "completion_length": 1474.0, + "epoch": 0.6365853658536585, + "grad_norm": 0.12424936891135602, + "kl": 0.0712890625, + "learning_rate": 1.0535109549008482e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4176 + }, + { + "completion_length": 1724.166748046875, + "epoch": 0.6367378048780488, + "grad_norm": 0.0858353341192742, + "kl": 0.0693359375, + "learning_rate": 1.0527490285648967e-06, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4177 + }, + { + "completion_length": 942.0000610351562, + "epoch": 0.636890243902439, + "grad_norm": 0.08841859358072372, + "kl": 0.0625, + "learning_rate": 1.05198722886528e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4178 + }, + { + "completion_length": 1421.0, + "epoch": 0.6370426829268293, + "grad_norm": 0.10218160388675984, + "kl": 0.06982421875, + "learning_rate": 1.0512255560176955e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4179 + }, + { + "completion_length": 1200.3333740234375, + "epoch": 0.6371951219512195, + "grad_norm": 1.4929321899741814, + "kl": 0.08984375, + "learning_rate": 1.0504640102378075e-06, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4180 + }, + { + "completion_length": 426.3333435058594, + "epoch": 0.6373475609756097, + "grad_norm": 0.1763220035197915, + "kl": 0.0552978515625, + "learning_rate": 1.049702591741242e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4181 + }, + { + "completion_length": 1369.8333740234375, + "epoch": 0.6375, + "grad_norm": 0.1173796464504609, + "kl": 0.076171875, + "learning_rate": 1.0489413007435905e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4182 + }, + { + "completion_length": 917.6666870117188, + "epoch": 0.6376524390243903, + "grad_norm": 0.3034634891577468, + "kl": 0.07568359375, + "learning_rate": 1.0481801374604073e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4183 + }, + { + "completion_length": 1449.666748046875, + "epoch": 0.6378048780487805, + "grad_norm": 0.10135522673165524, + "kl": 0.068359375, + "learning_rate": 1.0474191021072117e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4184 + }, + { + "completion_length": 1533.1666870117188, + "epoch": 0.6379573170731707, + "grad_norm": 0.11926611382756645, + "kl": 0.076171875, + "learning_rate": 1.0466581948994857e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4185 + }, + { + "completion_length": 924.6666870117188, + "epoch": 0.6381097560975609, + "grad_norm": 0.10413180870426457, + "kl": 0.0716552734375, + "learning_rate": 1.0458974160526752e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4186 + }, + { + "completion_length": 1996.5000610351562, + "epoch": 0.6382621951219513, + "grad_norm": 2.1262139651244483, + "kl": 0.085205078125, + "learning_rate": 1.0451367657821911e-06, + "loss": 0.0034, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 4187 + }, + { + "completion_length": 948.5000305175781, + "epoch": 0.6384146341463415, + "grad_norm": 0.12691021881127212, + "kl": 0.0584716796875, + "learning_rate": 1.0443762443034054e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4188 + }, + { + "completion_length": 1860.5000610351562, + "epoch": 0.6385670731707317, + "grad_norm": 0.08043634800941671, + "kl": 0.0760498046875, + "learning_rate": 1.043615851831656e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4189 + }, + { + "completion_length": 1357.6666870117188, + "epoch": 0.6387195121951219, + "grad_norm": 0.07770977469855532, + "kl": 0.0657958984375, + "learning_rate": 1.042855588582242e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4190 + }, + { + "completion_length": 1284.0000610351562, + "epoch": 0.6388719512195122, + "grad_norm": 0.13749106762144167, + "kl": 0.063232421875, + "learning_rate": 1.0420954547704292e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4191 + }, + { + "completion_length": 1792.1666870117188, + "epoch": 0.6390243902439025, + "grad_norm": 0.07452007464568758, + "kl": 0.0433349609375, + "learning_rate": 1.0413354506114434e-06, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4192 + }, + { + "completion_length": 857.1666870117188, + "epoch": 0.6391768292682927, + "grad_norm": 0.10184754128184094, + "kl": 0.058349609375, + "learning_rate": 1.040575576320476e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4193 + }, + { + "completion_length": 894.0000305175781, + "epoch": 0.6393292682926829, + "grad_norm": 0.1938524490371804, + "kl": 0.06494140625, + "learning_rate": 1.0398158321126793e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4194 + }, + { + "completion_length": 637.5000305175781, + "epoch": 0.6394817073170732, + "grad_norm": 0.31344441590536126, + "kl": 0.065185546875, + "learning_rate": 1.0390562182031716e-06, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4195 + }, + { + "completion_length": 1481.5000305175781, + "epoch": 0.6396341463414634, + "grad_norm": 0.10056133842597409, + "kl": 0.057373046875, + "learning_rate": 1.0382967348070315e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4196 + }, + { + "completion_length": 873.1666870117188, + "epoch": 0.6397865853658536, + "grad_norm": 0.10573091584649758, + "kl": 0.078857421875, + "learning_rate": 1.0375373821393031e-06, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4197 + }, + { + "completion_length": 979.6666870117188, + "epoch": 0.6399390243902439, + "grad_norm": 0.11439265331648947, + "kl": 0.0555419921875, + "learning_rate": 1.0367781604149923e-06, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4198 + }, + { + "completion_length": 563.8333435058594, + "epoch": 0.6400914634146342, + "grad_norm": 0.12746373921823512, + "kl": 0.0682373046875, + "learning_rate": 1.0360190698490675e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4199 + }, + { + "completion_length": 2027.5000915527344, + "epoch": 0.6402439024390244, + "grad_norm": 0.07634705324531987, + "kl": 0.0604248046875, + "learning_rate": 1.0352601106564607e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4200 + }, + { + "completion_length": 1847.0000915527344, + "epoch": 0.6403963414634146, + "grad_norm": 0.13858156910953084, + "kl": 0.060302734375, + "learning_rate": 1.0345012830520662e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4201 + }, + { + "completion_length": 1367.6667175292969, + "epoch": 0.6405487804878048, + "grad_norm": 0.1085805709627439, + "kl": 0.0703125, + "learning_rate": 1.0337425872507422e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4202 + }, + { + "completion_length": 1766.166748046875, + "epoch": 0.6407012195121952, + "grad_norm": 0.10544564407745963, + "kl": 0.051025390625, + "learning_rate": 1.0329840234673077e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4203 + }, + { + "completion_length": 2015.0, + "epoch": 0.6408536585365854, + "grad_norm": 0.943930573096801, + "kl": 0.071533203125, + "learning_rate": 1.0322255919165456e-06, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4204 + }, + { + "completion_length": 691.6666717529297, + "epoch": 0.6410060975609756, + "grad_norm": 0.19262257346557127, + "kl": 0.053955078125, + "learning_rate": 1.0314672928132017e-06, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4205 + }, + { + "completion_length": 2089.666748046875, + "epoch": 0.6411585365853658, + "grad_norm": 0.11825302138298512, + "kl": 0.0830078125, + "learning_rate": 1.0307091263719828e-06, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4206 + }, + { + "completion_length": 1256.3333740234375, + "epoch": 0.6413109756097561, + "grad_norm": 1.2702062213993544, + "kl": 0.07763671875, + "learning_rate": 1.0299510928075596e-06, + "loss": 0.0031, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4207 + }, + { + "completion_length": 507.33335876464844, + "epoch": 0.6414634146341464, + "grad_norm": 2.386374817458936, + "kl": 0.0927734375, + "learning_rate": 1.0291931923345635e-06, + "loss": 0.0037, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 4208 + }, + { + "completion_length": 1881.5000915527344, + "epoch": 0.6416158536585366, + "grad_norm": 0.09578490800453461, + "kl": 0.068115234375, + "learning_rate": 1.0284354251675914e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4209 + }, + { + "completion_length": 1040.1666870117188, + "epoch": 0.6417682926829268, + "grad_norm": 0.1020465763196924, + "kl": 0.0526123046875, + "learning_rate": 1.0276777915211984e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4210 + }, + { + "completion_length": 1027.0, + "epoch": 0.6419207317073171, + "grad_norm": 0.11234323873291183, + "kl": 0.06787109375, + "learning_rate": 1.026920291609905e-06, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4211 + }, + { + "completion_length": 1415.5000915527344, + "epoch": 0.6420731707317073, + "grad_norm": 0.12431472117882228, + "kl": 0.05029296875, + "learning_rate": 1.0261629256481923e-06, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4212 + }, + { + "completion_length": 2277.8333435058594, + "epoch": 0.6422256097560975, + "grad_norm": 0.10407683341649326, + "kl": 0.0635986328125, + "learning_rate": 1.0254056938505034e-06, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4213 + }, + { + "completion_length": 1510.5000610351562, + "epoch": 0.6423780487804878, + "grad_norm": 0.1658815032505037, + "kl": 0.075927734375, + "learning_rate": 1.024648596431244e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4214 + }, + { + "completion_length": 2638.0, + "epoch": 0.6425304878048781, + "grad_norm": 0.09541145325582542, + "kl": 0.060546875, + "learning_rate": 1.0238916336047812e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4215 + }, + { + "completion_length": 3048.1666870117188, + "epoch": 0.6426829268292683, + "grad_norm": 0.07293117527128891, + "kl": 0.0526123046875, + "learning_rate": 1.0231348055854452e-06, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4216 + }, + { + "completion_length": 2940.666748046875, + "epoch": 0.6428353658536585, + "grad_norm": 0.9491502491589769, + "kl": 0.0633544921875, + "learning_rate": 1.0223781125875261e-06, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4217 + }, + { + "completion_length": 1093.5, + "epoch": 0.6429878048780487, + "grad_norm": 0.10385378690015769, + "kl": 0.0565185546875, + "learning_rate": 1.0216215548252777e-06, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4218 + }, + { + "completion_length": 953.8333740234375, + "epoch": 0.6431402439024391, + "grad_norm": 2.53696981599435, + "kl": 0.066650390625, + "learning_rate": 1.020865132512914e-06, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4219 + }, + { + "completion_length": 2514.0, + "epoch": 0.6432926829268293, + "grad_norm": 0.08589815445177579, + "kl": 0.0838623046875, + "learning_rate": 1.0201088458646118e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4220 + }, + { + "completion_length": 991.0000305175781, + "epoch": 0.6434451219512195, + "grad_norm": 0.8918677329514669, + "kl": 0.083984375, + "learning_rate": 1.0193526950945083e-06, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4221 + }, + { + "completion_length": 2621.3333740234375, + "epoch": 0.6435975609756097, + "grad_norm": 0.09104494938348649, + "kl": 0.0487060546875, + "learning_rate": 1.0185966804167033e-06, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4222 + }, + { + "completion_length": 1695.666748046875, + "epoch": 0.64375, + "grad_norm": 0.09546714006511951, + "kl": 0.0599365234375, + "learning_rate": 1.017840802045258e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4223 + }, + { + "completion_length": 1975.1667175292969, + "epoch": 0.6439024390243903, + "grad_norm": 0.11241557223689908, + "kl": 0.0712890625, + "learning_rate": 1.0170850601941937e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4224 + }, + { + "completion_length": 1666.3333740234375, + "epoch": 0.6440548780487805, + "grad_norm": 2.3621683600144294, + "kl": 0.1064453125, + "learning_rate": 1.0163294550774954e-06, + "loss": 0.0043, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4225 + }, + { + "completion_length": 2749.666748046875, + "epoch": 0.6442073170731707, + "grad_norm": 0.06821923302507428, + "kl": 0.064697265625, + "learning_rate": 1.0155739869091068e-06, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4226 + }, + { + "completion_length": 2079.3334350585938, + "epoch": 0.644359756097561, + "grad_norm": 0.2497906756562736, + "kl": 0.0635986328125, + "learning_rate": 1.0148186559029338e-06, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4227 + }, + { + "completion_length": 1378.5000610351562, + "epoch": 0.6445121951219512, + "grad_norm": 0.13311983590511353, + "kl": 0.0906982421875, + "learning_rate": 1.0140634622728447e-06, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4228 + }, + { + "completion_length": 598.0, + "epoch": 0.6446646341463415, + "grad_norm": 0.20893266063557733, + "kl": 0.13134765625, + "learning_rate": 1.013308406232668e-06, + "loss": 0.0053, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4229 + }, + { + "completion_length": 2478.5000915527344, + "epoch": 0.6448170731707317, + "grad_norm": 0.11863558851544269, + "kl": 0.0692138671875, + "learning_rate": 1.0125534879961923e-06, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4230 + }, + { + "completion_length": 1689.8333435058594, + "epoch": 0.644969512195122, + "grad_norm": 0.1229461258042076, + "kl": 0.07861328125, + "learning_rate": 1.0117987077771685e-06, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4231 + }, + { + "completion_length": 796.8333435058594, + "epoch": 0.6451219512195122, + "grad_norm": 0.10274091729876925, + "kl": 0.060546875, + "learning_rate": 1.0110440657893074e-06, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4232 + }, + { + "completion_length": 1733.5000610351562, + "epoch": 0.6452743902439024, + "grad_norm": 0.09654331138104336, + "kl": 0.05029296875, + "learning_rate": 1.010289562246282e-06, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4233 + }, + { + "completion_length": 1222.1667175292969, + "epoch": 0.6454268292682926, + "grad_norm": 0.3211325640668102, + "kl": 0.09423828125, + "learning_rate": 1.0095351973617245e-06, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4234 + }, + { + "completion_length": 1230.5000610351562, + "epoch": 0.645579268292683, + "grad_norm": 2.18508859498144, + "kl": 0.118896484375, + "learning_rate": 1.0087809713492288e-06, + "loss": 0.0048, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4235 + }, + { + "completion_length": 758.6666870117188, + "epoch": 0.6457317073170732, + "grad_norm": 0.13961089504178978, + "kl": 0.072265625, + "learning_rate": 1.00802688442235e-06, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4236 + }, + { + "completion_length": 1590.5, + "epoch": 0.6458841463414634, + "grad_norm": 0.43021473608821775, + "kl": 0.091064453125, + "learning_rate": 1.0072729367946024e-06, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4237 + }, + { + "completion_length": 1191.6666870117188, + "epoch": 0.6460365853658536, + "grad_norm": 1.598807887027324, + "kl": 0.0750732421875, + "learning_rate": 1.0065191286794618e-06, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4238 + }, + { + "completion_length": 2388.8333740234375, + "epoch": 0.646189024390244, + "grad_norm": 0.09890921598908435, + "kl": 0.0594482421875, + "learning_rate": 1.005765460290364e-06, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4239 + }, + { + "completion_length": 1774.0000610351562, + "epoch": 0.6463414634146342, + "grad_norm": 0.15199239634238462, + "kl": 0.068115234375, + "learning_rate": 1.0050119318407061e-06, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4240 + }, + { + "completion_length": 1406.1666870117188, + "epoch": 0.6464939024390244, + "grad_norm": 1.8703212118079076, + "kl": 0.085693359375, + "learning_rate": 1.0042585435438446e-06, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4241 + }, + { + "completion_length": 974.8333740234375, + "epoch": 0.6466463414634146, + "grad_norm": 0.12402319867405838, + "kl": 0.0888671875, + "learning_rate": 1.0035052956130967e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4242 + }, + { + "completion_length": 1492.0000610351562, + "epoch": 0.6467987804878049, + "grad_norm": 0.10469101745809274, + "kl": 0.075439453125, + "learning_rate": 1.0027521882617405e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4243 + }, + { + "completion_length": 1314.0000305175781, + "epoch": 0.6469512195121951, + "grad_norm": 0.11616653502794372, + "kl": 0.088134765625, + "learning_rate": 1.0019992217030127e-06, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4244 + }, + { + "completion_length": 1265.3333435058594, + "epoch": 0.6471036585365854, + "grad_norm": 0.11484947047166871, + "kl": 0.075927734375, + "learning_rate": 1.0012463961501112e-06, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4245 + }, + { + "completion_length": 1570.0000610351562, + "epoch": 0.6472560975609756, + "grad_norm": 0.08677152063960893, + "kl": 0.05712890625, + "learning_rate": 1.0004937118161948e-06, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4246 + }, + { + "completion_length": 1855.5001220703125, + "epoch": 0.6474085365853659, + "grad_norm": 0.0988525300882974, + "kl": 0.079833984375, + "learning_rate": 9.997411689143807e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4247 + }, + { + "completion_length": 1424.8333740234375, + "epoch": 0.6475609756097561, + "grad_norm": 2.112208855018012, + "kl": 0.08203125, + "learning_rate": 9.98988767657747e-07, + "loss": 0.0033, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 4248 + }, + { + "completion_length": 2512.8333740234375, + "epoch": 0.6477134146341463, + "grad_norm": 0.08175324274059714, + "kl": 0.0645751953125, + "learning_rate": 9.982365082593313e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4249 + }, + { + "completion_length": 1592.8333435058594, + "epoch": 0.6478658536585366, + "grad_norm": 0.09462146504202866, + "kl": 0.085205078125, + "learning_rate": 9.974843909321313e-07, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4250 + }, + { + "completion_length": 1173.5000610351562, + "epoch": 0.6480182926829269, + "grad_norm": 1.8542225462219901, + "kl": 0.0838623046875, + "learning_rate": 9.96732415889104e-07, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4251 + }, + { + "completion_length": 2621.3333740234375, + "epoch": 0.6481707317073171, + "grad_norm": 0.06381477171645163, + "kl": 0.0543212890625, + "learning_rate": 9.95980583343167e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4252 + }, + { + "completion_length": 857.1666870117188, + "epoch": 0.6483231707317073, + "grad_norm": 2.594863630478163, + "kl": 0.094970703125, + "learning_rate": 9.952288935071962e-07, + "loss": 0.0038, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4253 + }, + { + "completion_length": 787.6666870117188, + "epoch": 0.6484756097560975, + "grad_norm": 0.31872864809150264, + "kl": 0.072021484375, + "learning_rate": 9.94477346594029e-07, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4254 + }, + { + "completion_length": 1731.6666870117188, + "epoch": 0.6486280487804879, + "grad_norm": 0.12981151726669263, + "kl": 0.06982421875, + "learning_rate": 9.937259428164601e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4255 + }, + { + "completion_length": 1473.5, + "epoch": 0.6487804878048781, + "grad_norm": 1.992933583342477, + "kl": 0.0869140625, + "learning_rate": 9.929746823872462e-07, + "loss": 0.0035, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4256 + }, + { + "completion_length": 602.1666870117188, + "epoch": 0.6489329268292683, + "grad_norm": 0.11415871504228002, + "kl": 0.0589599609375, + "learning_rate": 9.922235655191006e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4257 + }, + { + "completion_length": 1001.0, + "epoch": 0.6490853658536585, + "grad_norm": 0.13828455327337236, + "kl": 0.0625, + "learning_rate": 9.914725924246984e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4258 + }, + { + "completion_length": 1230.0, + "epoch": 0.6492378048780488, + "grad_norm": 0.15939038493287086, + "kl": 0.100830078125, + "learning_rate": 9.90721763316672e-07, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4259 + }, + { + "completion_length": 1053.1666870117188, + "epoch": 0.649390243902439, + "grad_norm": 2.1460697275133227, + "kl": 0.0821533203125, + "learning_rate": 9.899710784076147e-07, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4260 + }, + { + "completion_length": 636.0, + "epoch": 0.6495426829268293, + "grad_norm": 2.35147541722598, + "kl": 0.0982666015625, + "learning_rate": 9.892205379100785e-07, + "loss": 0.0039, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4261 + }, + { + "completion_length": 1017.5000610351562, + "epoch": 0.6496951219512195, + "grad_norm": 0.1164384591336651, + "kl": 0.07666015625, + "learning_rate": 9.884701420365734e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4262 + }, + { + "completion_length": 1781.3333740234375, + "epoch": 0.6498475609756098, + "grad_norm": 0.09250676859074254, + "kl": 0.0791015625, + "learning_rate": 9.877198909995696e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4263 + }, + { + "completion_length": 967.5000305175781, + "epoch": 0.65, + "grad_norm": 0.09266419440991899, + "kl": 0.0601806640625, + "learning_rate": 9.86969785011497e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4264 + }, + { + "completion_length": 1533.8333435058594, + "epoch": 0.6501524390243902, + "grad_norm": 0.2462537252100617, + "kl": 0.07177734375, + "learning_rate": 9.86219824284742e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4265 + }, + { + "completion_length": 1065.3333740234375, + "epoch": 0.6503048780487805, + "grad_norm": 0.13687848276209613, + "kl": 0.100341796875, + "learning_rate": 9.854700090316526e-07, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4266 + }, + { + "completion_length": 1029.1667175292969, + "epoch": 0.6504573170731708, + "grad_norm": 0.09092608778153831, + "kl": 0.06298828125, + "learning_rate": 9.847203394645337e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4267 + }, + { + "completion_length": 1360.5, + "epoch": 0.650609756097561, + "grad_norm": 2.5404619465852982, + "kl": 0.112060546875, + "learning_rate": 9.839708157956493e-07, + "loss": 0.0045, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4268 + }, + { + "completion_length": 629.1666870117188, + "epoch": 0.6507621951219512, + "grad_norm": 0.15529871120428201, + "kl": 0.075439453125, + "learning_rate": 9.83221438237223e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4269 + }, + { + "completion_length": 1765.666748046875, + "epoch": 0.6509146341463414, + "grad_norm": 1.1546526009638747, + "kl": 0.090087890625, + "learning_rate": 9.82472207001436e-07, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4270 + }, + { + "completion_length": 1111.1666870117188, + "epoch": 0.6510670731707318, + "grad_norm": 0.11050362199541673, + "kl": 0.07666015625, + "learning_rate": 9.817231223004285e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4271 + }, + { + "completion_length": 2592.5, + "epoch": 0.651219512195122, + "grad_norm": 1.7283058155476296, + "kl": 0.072265625, + "learning_rate": 9.809741843462994e-07, + "loss": 0.0029, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4272 + }, + { + "completion_length": 976.3333740234375, + "epoch": 0.6513719512195122, + "grad_norm": 0.11994224152054556, + "kl": 0.0845947265625, + "learning_rate": 9.802253933511056e-07, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4273 + }, + { + "completion_length": 1357.5000610351562, + "epoch": 0.6515243902439024, + "grad_norm": 0.10774193018154958, + "kl": 0.081298828125, + "learning_rate": 9.794767495268627e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4274 + }, + { + "completion_length": 1356.5000610351562, + "epoch": 0.6516768292682927, + "grad_norm": 0.1477367622606502, + "kl": 0.07177734375, + "learning_rate": 9.787282530855443e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4275 + }, + { + "completion_length": 1076.5, + "epoch": 0.651829268292683, + "grad_norm": 0.13106302277244525, + "kl": 0.08251953125, + "learning_rate": 9.779799042390833e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4276 + }, + { + "completion_length": 1018.6666870117188, + "epoch": 0.6519817073170732, + "grad_norm": 0.0908692302540161, + "kl": 0.07177734375, + "learning_rate": 9.77231703199369e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4277 + }, + { + "completion_length": 1057.1666870117188, + "epoch": 0.6521341463414634, + "grad_norm": 0.18013161775536174, + "kl": 0.09814453125, + "learning_rate": 9.764836501782503e-07, + "loss": 0.0039, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4278 + }, + { + "completion_length": 1526.5000915527344, + "epoch": 0.6522865853658537, + "grad_norm": 0.1050453348438406, + "kl": 0.0675048828125, + "learning_rate": 9.757357453875343e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4279 + }, + { + "completion_length": 1038.3333740234375, + "epoch": 0.6524390243902439, + "grad_norm": 0.12839945342934786, + "kl": 0.09375, + "learning_rate": 9.749879890389848e-07, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4280 + }, + { + "completion_length": 1614.6666870117188, + "epoch": 0.6525914634146341, + "grad_norm": 0.13550141292080162, + "kl": 0.08447265625, + "learning_rate": 9.74240381344325e-07, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4281 + }, + { + "completion_length": 950.3333740234375, + "epoch": 0.6527439024390244, + "grad_norm": 0.11491272768682559, + "kl": 0.071044921875, + "learning_rate": 9.734929225152345e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4282 + }, + { + "completion_length": 1372.0000305175781, + "epoch": 0.6528963414634147, + "grad_norm": 2.304490593106153, + "kl": 0.103759765625, + "learning_rate": 9.727456127633523e-07, + "loss": 0.0042, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 4283 + }, + { + "completion_length": 865.8333435058594, + "epoch": 0.6530487804878049, + "grad_norm": 0.20493484494909092, + "kl": 0.093994140625, + "learning_rate": 9.719984523002745e-07, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4284 + }, + { + "completion_length": 1693.0001220703125, + "epoch": 0.6532012195121951, + "grad_norm": 0.10128653229163839, + "kl": 0.08203125, + "learning_rate": 9.712514413375552e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4285 + }, + { + "completion_length": 1488.6666870117188, + "epoch": 0.6533536585365853, + "grad_norm": 1.2085531541094188, + "kl": 0.069580078125, + "learning_rate": 9.705045800867052e-07, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4286 + }, + { + "completion_length": 1129.0000305175781, + "epoch": 0.6535060975609757, + "grad_norm": 0.0866915585942528, + "kl": 0.0623779296875, + "learning_rate": 9.697578687591944e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4287 + }, + { + "completion_length": 4096.0, + "epoch": 0.6536585365853659, + "grad_norm": 0.056994539293947016, + "kl": 0.0545654296875, + "learning_rate": 9.690113075664488e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4288 + }, + { + "completion_length": 1085.8333435058594, + "epoch": 0.6538109756097561, + "grad_norm": 0.13404952346811086, + "kl": 0.0771484375, + "learning_rate": 9.68264896719853e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4289 + }, + { + "completion_length": 1935.166748046875, + "epoch": 0.6539634146341463, + "grad_norm": 0.11962837847088764, + "kl": 0.06689453125, + "learning_rate": 9.67518636430748e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4290 + }, + { + "completion_length": 817.3333435058594, + "epoch": 0.6541158536585366, + "grad_norm": 0.16394932318003533, + "kl": 0.08740234375, + "learning_rate": 9.667725269104332e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4291 + }, + { + "completion_length": 1366.5000610351562, + "epoch": 0.6542682926829269, + "grad_norm": 0.12641145989858835, + "kl": 0.0732421875, + "learning_rate": 9.660265683701652e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4292 + }, + { + "completion_length": 2317.0000610351562, + "epoch": 0.6544207317073171, + "grad_norm": 0.0988359964477372, + "kl": 0.0755615234375, + "learning_rate": 9.652807610211569e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4293 + }, + { + "completion_length": 2069.8334350585938, + "epoch": 0.6545731707317073, + "grad_norm": 0.16137262012491527, + "kl": 0.07763671875, + "learning_rate": 9.645351050745796e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4294 + }, + { + "completion_length": 3458.666748046875, + "epoch": 0.6547256097560976, + "grad_norm": 0.08987480038792714, + "kl": 0.073486328125, + "learning_rate": 9.637896007415608e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4295 + }, + { + "completion_length": 1369.8333740234375, + "epoch": 0.6548780487804878, + "grad_norm": 1.4001871951106155, + "kl": 0.081298828125, + "learning_rate": 9.630442482331853e-07, + "loss": 0.0033, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4296 + }, + { + "completion_length": 2711.166748046875, + "epoch": 0.655030487804878, + "grad_norm": 0.09606598535114907, + "kl": 0.0638427734375, + "learning_rate": 9.622990477604953e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4297 + }, + { + "completion_length": 1027.0, + "epoch": 0.6551829268292683, + "grad_norm": 0.13015533628199036, + "kl": 0.06884765625, + "learning_rate": 9.615539995344894e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4298 + }, + { + "completion_length": 3674.166748046875, + "epoch": 0.6553353658536586, + "grad_norm": 0.07696586681573823, + "kl": 0.050537109375, + "learning_rate": 9.60809103766124e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4299 + }, + { + "completion_length": 2478.166748046875, + "epoch": 0.6554878048780488, + "grad_norm": 0.08412829369197071, + "kl": 0.058837890625, + "learning_rate": 9.600643606663104e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4300 + }, + { + "completion_length": 2923.3334350585938, + "epoch": 0.655640243902439, + "grad_norm": 0.08735387928030822, + "kl": 0.0657958984375, + "learning_rate": 9.593197704459194e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4301 + }, + { + "completion_length": 2955.666748046875, + "epoch": 0.6557926829268292, + "grad_norm": 0.10316485044388614, + "kl": 0.064697265625, + "learning_rate": 9.585753333157764e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4302 + }, + { + "completion_length": 1039.6667175292969, + "epoch": 0.6559451219512196, + "grad_norm": 0.13174119738744042, + "kl": 0.0589599609375, + "learning_rate": 9.57831049486665e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4303 + }, + { + "completion_length": 2736.5001220703125, + "epoch": 0.6560975609756098, + "grad_norm": 0.0888407876636123, + "kl": 0.0550537109375, + "learning_rate": 9.57086919169323e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4304 + }, + { + "completion_length": 4096.0, + "epoch": 0.65625, + "grad_norm": 0.055249466841010765, + "kl": 0.0455322265625, + "learning_rate": 9.563429425744476e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4305 + }, + { + "completion_length": 2029.166748046875, + "epoch": 0.6564024390243902, + "grad_norm": 0.08740905054157856, + "kl": 0.065673828125, + "learning_rate": 9.555991199126904e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4306 + }, + { + "completion_length": 3176.3333740234375, + "epoch": 0.6565548780487804, + "grad_norm": 0.07144649597435146, + "kl": 0.060302734375, + "learning_rate": 9.548554513946608e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4307 + }, + { + "completion_length": 3524.5, + "epoch": 0.6567073170731708, + "grad_norm": 0.05318874742342914, + "kl": 0.0521240234375, + "learning_rate": 9.541119372309233e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4308 + }, + { + "completion_length": 2839.8333740234375, + "epoch": 0.656859756097561, + "grad_norm": 0.06433893498951795, + "kl": 0.0604248046875, + "learning_rate": 9.533685776319993e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4309 + }, + { + "completion_length": 2710.5, + "epoch": 0.6570121951219512, + "grad_norm": 1.342840638519277, + "kl": 0.0703125, + "learning_rate": 9.526253728083674e-07, + "loss": 0.0028, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4310 + }, + { + "completion_length": 2630.166748046875, + "epoch": 0.6571646341463414, + "grad_norm": 0.07890908040902714, + "kl": 0.06005859375, + "learning_rate": 9.518823229704601e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4311 + }, + { + "completion_length": 3482.0, + "epoch": 0.6573170731707317, + "grad_norm": 0.06653725057477239, + "kl": 0.048828125, + "learning_rate": 9.511394283286686e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4312 + }, + { + "completion_length": 2564.6666870117188, + "epoch": 0.657469512195122, + "grad_norm": 0.07084160110410936, + "kl": 0.0574951171875, + "learning_rate": 9.503966890933377e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4313 + }, + { + "completion_length": 952.1666870117188, + "epoch": 0.6576219512195122, + "grad_norm": 0.11813429441263357, + "kl": 0.068359375, + "learning_rate": 9.496541054747705e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4314 + }, + { + "completion_length": 2359.8334350585938, + "epoch": 0.6577743902439024, + "grad_norm": 0.12422237934291236, + "kl": 0.062744140625, + "learning_rate": 9.489116776832242e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4315 + }, + { + "completion_length": 1400.6667175292969, + "epoch": 0.6579268292682927, + "grad_norm": 0.07309194741152429, + "kl": 0.051025390625, + "learning_rate": 9.481694059289126e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4316 + }, + { + "completion_length": 3889.8333740234375, + "epoch": 0.6580792682926829, + "grad_norm": 0.03518619181592313, + "kl": 0.0477294921875, + "learning_rate": 9.474272904220061e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4317 + }, + { + "completion_length": 1662.6666870117188, + "epoch": 0.6582317073170731, + "grad_norm": 0.10947991549654298, + "kl": 0.064453125, + "learning_rate": 9.466853313726293e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4318 + }, + { + "completion_length": 2389.666748046875, + "epoch": 0.6583841463414634, + "grad_norm": 0.0804174897690488, + "kl": 0.0521240234375, + "learning_rate": 9.459435289908632e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4319 + }, + { + "completion_length": 1235.6666870117188, + "epoch": 0.6585365853658537, + "grad_norm": 0.11712992617991702, + "kl": 0.09765625, + "learning_rate": 9.452018834867454e-07, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4320 + }, + { + "completion_length": 2532.5001220703125, + "epoch": 0.6586890243902439, + "grad_norm": 0.12947210906549236, + "kl": 0.056396484375, + "learning_rate": 9.444603950702677e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4321 + }, + { + "completion_length": 2040.1666870117188, + "epoch": 0.6588414634146341, + "grad_norm": 0.10450623826461528, + "kl": 0.0628662109375, + "learning_rate": 9.437190639513778e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4322 + }, + { + "completion_length": 2363.166748046875, + "epoch": 0.6589939024390243, + "grad_norm": 0.06791323321285127, + "kl": 0.05615234375, + "learning_rate": 9.429778903399796e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4323 + }, + { + "completion_length": 1955.666748046875, + "epoch": 0.6591463414634147, + "grad_norm": 0.16180615986807434, + "kl": 0.06591796875, + "learning_rate": 9.422368744459309e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4324 + }, + { + "completion_length": 1551.0000610351562, + "epoch": 0.6592987804878049, + "grad_norm": 1.4608674940280968, + "kl": 0.073486328125, + "learning_rate": 9.414960164790468e-07, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4325 + }, + { + "completion_length": 1459.0000610351562, + "epoch": 0.6594512195121951, + "grad_norm": 0.16306178425024134, + "kl": 0.0615234375, + "learning_rate": 9.407553166490956e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4326 + }, + { + "completion_length": 565.0, + "epoch": 0.6596036585365853, + "grad_norm": 0.5248920075156546, + "kl": 0.0758056640625, + "learning_rate": 9.400147751658026e-07, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4327 + }, + { + "completion_length": 1981.0000610351562, + "epoch": 0.6597560975609756, + "grad_norm": 0.12309672608992284, + "kl": 0.07763671875, + "learning_rate": 9.392743922388469e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4328 + }, + { + "completion_length": 1879.3333740234375, + "epoch": 0.6599085365853659, + "grad_norm": 1.871429727865409, + "kl": 0.08544921875, + "learning_rate": 9.385341680778638e-07, + "loss": 0.0034, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4329 + }, + { + "completion_length": 1818.0000915527344, + "epoch": 0.6600609756097561, + "grad_norm": 0.131996456723314, + "kl": 0.0711669921875, + "learning_rate": 9.377941028924434e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4330 + }, + { + "completion_length": 2400.1666870117188, + "epoch": 0.6602134146341463, + "grad_norm": 0.11035341548915746, + "kl": 0.0535888671875, + "learning_rate": 9.370541968921296e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4331 + }, + { + "completion_length": 1250.5000610351562, + "epoch": 0.6603658536585366, + "grad_norm": 1.3553087906648336, + "kl": 0.0762939453125, + "learning_rate": 9.363144502864233e-07, + "loss": 0.0031, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 4332 + }, + { + "completion_length": 1138.0, + "epoch": 0.6605182926829268, + "grad_norm": 0.11203140480727553, + "kl": 0.0621337890625, + "learning_rate": 9.355748632847783e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4333 + }, + { + "completion_length": 723.6666870117188, + "epoch": 0.660670731707317, + "grad_norm": 0.14404809977582006, + "kl": 0.085693359375, + "learning_rate": 9.348354360966044e-07, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4334 + }, + { + "completion_length": 1729.666748046875, + "epoch": 0.6608231707317073, + "grad_norm": 0.11417744599380661, + "kl": 0.0682373046875, + "learning_rate": 9.340961689312662e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4335 + }, + { + "completion_length": 2375.0, + "epoch": 0.6609756097560976, + "grad_norm": 0.08661503358329073, + "kl": 0.06787109375, + "learning_rate": 9.333570619980818e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4336 + }, + { + "completion_length": 2162.666748046875, + "epoch": 0.6611280487804878, + "grad_norm": 0.0929513254498445, + "kl": 0.0667724609375, + "learning_rate": 9.326181155063248e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4337 + }, + { + "completion_length": 1727.0000610351562, + "epoch": 0.661280487804878, + "grad_norm": 0.13805224661110782, + "kl": 0.0657958984375, + "learning_rate": 9.318793296652243e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4338 + }, + { + "completion_length": 1744.666748046875, + "epoch": 0.6614329268292682, + "grad_norm": 0.1442911020454222, + "kl": 0.072509765625, + "learning_rate": 9.311407046839622e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4339 + }, + { + "completion_length": 1493.5000610351562, + "epoch": 0.6615853658536586, + "grad_norm": 0.12331886331886381, + "kl": 0.07373046875, + "learning_rate": 9.304022407716754e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4340 + }, + { + "completion_length": 1979.6667175292969, + "epoch": 0.6617378048780488, + "grad_norm": 0.08747693127665233, + "kl": 0.052001953125, + "learning_rate": 9.296639381374561e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4341 + }, + { + "completion_length": 1869.0000610351562, + "epoch": 0.661890243902439, + "grad_norm": 0.11182630188017689, + "kl": 0.07568359375, + "learning_rate": 9.289257969903492e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4342 + }, + { + "completion_length": 1231.8333740234375, + "epoch": 0.6620426829268292, + "grad_norm": 0.13438526844190454, + "kl": 0.0888671875, + "learning_rate": 9.281878175393556e-07, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4343 + }, + { + "completion_length": 1545.6666870117188, + "epoch": 0.6621951219512195, + "grad_norm": 0.08722888807057033, + "kl": 0.0712890625, + "learning_rate": 9.27449999993429e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4344 + }, + { + "completion_length": 2135.0001220703125, + "epoch": 0.6623475609756098, + "grad_norm": 0.08940935834718026, + "kl": 0.0732421875, + "learning_rate": 9.267123445614783e-07, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4345 + }, + { + "completion_length": 2379.0, + "epoch": 0.6625, + "grad_norm": 1.9811097159279794, + "kl": 0.061279296875, + "learning_rate": 9.259748514523654e-07, + "loss": 0.0024, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 4346 + }, + { + "completion_length": 1625.1666870117188, + "epoch": 0.6626524390243902, + "grad_norm": 0.11731776233485344, + "kl": 0.06396484375, + "learning_rate": 9.252375208749074e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4347 + }, + { + "completion_length": 1427.0, + "epoch": 0.6628048780487805, + "grad_norm": 0.18020606775637718, + "kl": 0.080322265625, + "learning_rate": 9.245003530378752e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4348 + }, + { + "completion_length": 1803.6667175292969, + "epoch": 0.6629573170731707, + "grad_norm": 0.23176899797813894, + "kl": 0.0692138671875, + "learning_rate": 9.237633481499926e-07, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4349 + }, + { + "completion_length": 950.5000305175781, + "epoch": 0.663109756097561, + "grad_norm": 0.08706658646880239, + "kl": 0.0592041015625, + "learning_rate": 9.230265064199384e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4350 + }, + { + "completion_length": 2796.0, + "epoch": 0.6632621951219512, + "grad_norm": 2.22582136931006, + "kl": 0.0682373046875, + "learning_rate": 9.222898280563443e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4351 + }, + { + "completion_length": 2485.666748046875, + "epoch": 0.6634146341463415, + "grad_norm": 0.07120409389410143, + "kl": 0.06689453125, + "learning_rate": 9.215533132677969e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4352 + }, + { + "completion_length": 1816.5, + "epoch": 0.6635670731707317, + "grad_norm": 0.14993546619740075, + "kl": 0.08740234375, + "learning_rate": 9.20816962262835e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4353 + }, + { + "completion_length": 2729.8333740234375, + "epoch": 0.6637195121951219, + "grad_norm": 0.06741503856234861, + "kl": 0.056884765625, + "learning_rate": 9.200807752499522e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4354 + }, + { + "completion_length": 1205.1667175292969, + "epoch": 0.6638719512195121, + "grad_norm": 0.13982894971514856, + "kl": 0.070556640625, + "learning_rate": 9.193447524375956e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4355 + }, + { + "completion_length": 848.1666870117188, + "epoch": 0.6640243902439025, + "grad_norm": 0.10295568476815259, + "kl": 0.0693359375, + "learning_rate": 9.186088940341646e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4356 + }, + { + "completion_length": 1521.0000610351562, + "epoch": 0.6641768292682927, + "grad_norm": 0.09314477077047027, + "kl": 0.072998046875, + "learning_rate": 9.178732002480137e-07, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4357 + }, + { + "completion_length": 1633.1666870117188, + "epoch": 0.6643292682926829, + "grad_norm": 0.12078397322279395, + "kl": 0.072265625, + "learning_rate": 9.171376712874502e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4358 + }, + { + "completion_length": 631.5000305175781, + "epoch": 0.6644817073170731, + "grad_norm": 0.4644065099194289, + "kl": 0.0751953125, + "learning_rate": 9.164023073607338e-07, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4359 + }, + { + "completion_length": 2349.666748046875, + "epoch": 0.6646341463414634, + "grad_norm": 0.11651175011382796, + "kl": 0.0565185546875, + "learning_rate": 9.156671086760788e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4360 + }, + { + "completion_length": 1263.6667175292969, + "epoch": 0.6647865853658537, + "grad_norm": 0.17259406724817938, + "kl": 0.0830078125, + "learning_rate": 9.149320754416521e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4361 + }, + { + "completion_length": 1547.666748046875, + "epoch": 0.6649390243902439, + "grad_norm": 1.679969126801217, + "kl": 0.08203125, + "learning_rate": 9.141972078655732e-07, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4362 + }, + { + "completion_length": 1380.8334350585938, + "epoch": 0.6650914634146341, + "grad_norm": 0.11249084879840617, + "kl": 0.0633544921875, + "learning_rate": 9.134625061559164e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4363 + }, + { + "completion_length": 2837.8334350585938, + "epoch": 0.6652439024390244, + "grad_norm": 0.09945772594754533, + "kl": 0.0601806640625, + "learning_rate": 9.127279705207067e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4364 + }, + { + "completion_length": 1155.5, + "epoch": 0.6653963414634146, + "grad_norm": 0.10109823511357842, + "kl": 0.0653076171875, + "learning_rate": 9.11993601167924e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4365 + }, + { + "completion_length": 1852.5001220703125, + "epoch": 0.6655487804878049, + "grad_norm": 1.0004905561471988, + "kl": 0.0582275390625, + "learning_rate": 9.112593983055004e-07, + "loss": 0.0023, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 4366 + }, + { + "completion_length": 2636.5001220703125, + "epoch": 0.6657012195121951, + "grad_norm": 0.07931020728754046, + "kl": 0.0545654296875, + "learning_rate": 9.105253621413206e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4367 + }, + { + "completion_length": 1158.1666870117188, + "epoch": 0.6658536585365854, + "grad_norm": 1.6040851978904753, + "kl": 0.10205078125, + "learning_rate": 9.097914928832228e-07, + "loss": 0.0041, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4368 + }, + { + "completion_length": 1326.8333435058594, + "epoch": 0.6660060975609756, + "grad_norm": 1.782280912074703, + "kl": 0.073486328125, + "learning_rate": 9.09057790738997e-07, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4369 + }, + { + "completion_length": 3196.8333740234375, + "epoch": 0.6661585365853658, + "grad_norm": 0.06511501016309922, + "kl": 0.06103515625, + "learning_rate": 9.083242559163869e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4370 + }, + { + "completion_length": 3438.3333740234375, + "epoch": 0.666310975609756, + "grad_norm": 0.07140838940243309, + "kl": 0.059326171875, + "learning_rate": 9.075908886230877e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4371 + }, + { + "completion_length": 2322.0000915527344, + "epoch": 0.6664634146341464, + "grad_norm": 0.08992860944473957, + "kl": 0.0546875, + "learning_rate": 9.068576890667484e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4372 + }, + { + "completion_length": 1619.5000610351562, + "epoch": 0.6666158536585366, + "grad_norm": 0.1333178309937437, + "kl": 0.078857421875, + "learning_rate": 9.061246574549698e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4373 + }, + { + "completion_length": 1336.1666870117188, + "epoch": 0.6667682926829268, + "grad_norm": 1.9464405933025086, + "kl": 0.0806884765625, + "learning_rate": 9.053917939953046e-07, + "loss": 0.0032, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 4374 + }, + { + "completion_length": 2326.3333740234375, + "epoch": 0.666920731707317, + "grad_norm": 0.0995723969503664, + "kl": 0.087646484375, + "learning_rate": 9.046590988952594e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4375 + }, + { + "completion_length": 3190.666748046875, + "epoch": 0.6670731707317074, + "grad_norm": 0.08352933589336761, + "kl": 0.04833984375, + "learning_rate": 9.039265723622923e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4376 + }, + { + "completion_length": 3504.0, + "epoch": 0.6672256097560976, + "grad_norm": 0.062088597900644464, + "kl": 0.063232421875, + "learning_rate": 9.031942146038127e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4377 + }, + { + "completion_length": 3348.3333740234375, + "epoch": 0.6673780487804878, + "grad_norm": 0.16633105940306037, + "kl": 0.0653076171875, + "learning_rate": 9.024620258271839e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4378 + }, + { + "completion_length": 2411.6666870117188, + "epoch": 0.667530487804878, + "grad_norm": 0.12740350746521356, + "kl": 0.062255859375, + "learning_rate": 9.017300062397209e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4379 + }, + { + "completion_length": 1826.5, + "epoch": 0.6676829268292683, + "grad_norm": 0.1347716818995712, + "kl": 0.0665283203125, + "learning_rate": 9.009981560486894e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4380 + }, + { + "completion_length": 2103.666748046875, + "epoch": 0.6678353658536585, + "grad_norm": 0.12639342885484037, + "kl": 0.065185546875, + "learning_rate": 9.002664754613093e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4381 + }, + { + "completion_length": 3027.0001220703125, + "epoch": 0.6679878048780488, + "grad_norm": 0.5048434404772992, + "kl": 0.0650634765625, + "learning_rate": 8.995349646847508e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4382 + }, + { + "completion_length": 1333.5, + "epoch": 0.668140243902439, + "grad_norm": 0.1169493763944997, + "kl": 0.065185546875, + "learning_rate": 8.988036239261369e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4383 + }, + { + "completion_length": 3101.3333740234375, + "epoch": 0.6682926829268293, + "grad_norm": 0.04944223606080732, + "kl": 0.0528564453125, + "learning_rate": 8.980724533925419e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4384 + }, + { + "completion_length": 837.3333740234375, + "epoch": 0.6684451219512195, + "grad_norm": 0.18429104104396576, + "kl": 0.07177734375, + "learning_rate": 8.973414532909922e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4385 + }, + { + "completion_length": 2807.0, + "epoch": 0.6685975609756097, + "grad_norm": 0.2608228237004438, + "kl": 0.053955078125, + "learning_rate": 8.966106238284668e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4386 + }, + { + "completion_length": 2241.5000610351562, + "epoch": 0.66875, + "grad_norm": 0.09896538945033903, + "kl": 0.0638427734375, + "learning_rate": 8.958799652118944e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4387 + }, + { + "completion_length": 2292.8333740234375, + "epoch": 0.6689024390243903, + "grad_norm": 0.08480328892763424, + "kl": 0.048583984375, + "learning_rate": 8.95149477648157e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4388 + }, + { + "completion_length": 1573.0000610351562, + "epoch": 0.6690548780487805, + "grad_norm": 1.4389432497747232, + "kl": 0.07177734375, + "learning_rate": 8.944191613440876e-07, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4389 + }, + { + "completion_length": 1994.3333740234375, + "epoch": 0.6692073170731707, + "grad_norm": 0.0888032472818099, + "kl": 0.0469970703125, + "learning_rate": 8.936890165064705e-07, + "loss": 0.0019, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 4390 + }, + { + "completion_length": 4096.0, + "epoch": 0.6693597560975609, + "grad_norm": 0.03667568697564204, + "kl": 0.0421142578125, + "learning_rate": 8.929590433420418e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4391 + }, + { + "completion_length": 3164.666748046875, + "epoch": 0.6695121951219513, + "grad_norm": 0.07168900875354858, + "kl": 0.05615234375, + "learning_rate": 8.922292420574888e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4392 + }, + { + "completion_length": 2956.5001220703125, + "epoch": 0.6696646341463415, + "grad_norm": 0.0597426544681193, + "kl": 0.05322265625, + "learning_rate": 8.914996128594498e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4393 + }, + { + "completion_length": 3298.166748046875, + "epoch": 0.6698170731707317, + "grad_norm": 0.04863101102546225, + "kl": 0.05029296875, + "learning_rate": 8.907701559545161e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4394 + }, + { + "completion_length": 1951.3333740234375, + "epoch": 0.6699695121951219, + "grad_norm": 0.09255579306121713, + "kl": 0.069091796875, + "learning_rate": 8.900408715492277e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4395 + }, + { + "completion_length": 2490.166748046875, + "epoch": 0.6701219512195122, + "grad_norm": 0.051764664916208146, + "kl": 0.05615234375, + "learning_rate": 8.893117598500773e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4396 + }, + { + "completion_length": 3290.33349609375, + "epoch": 0.6702743902439025, + "grad_norm": 0.07580206284434228, + "kl": 0.06494140625, + "learning_rate": 8.885828210635084e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4397 + }, + { + "completion_length": 1965.5000610351562, + "epoch": 0.6704268292682927, + "grad_norm": 0.15193044409629183, + "kl": 0.071044921875, + "learning_rate": 8.878540553959154e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4398 + }, + { + "completion_length": 1037.166748046875, + "epoch": 0.6705792682926829, + "grad_norm": 2.329586158435217, + "kl": 0.1142578125, + "learning_rate": 8.871254630536442e-07, + "loss": 0.0046, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4399 + }, + { + "completion_length": 2537.8333435058594, + "epoch": 0.6707317073170732, + "grad_norm": 0.0818726354660298, + "kl": 0.041015625, + "learning_rate": 8.863970442429902e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4400 + }, + { + "completion_length": 1386.3333740234375, + "epoch": 0.6708841463414634, + "grad_norm": 0.10714488587845734, + "kl": 0.0572509765625, + "learning_rate": 8.856687991702016e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4401 + }, + { + "completion_length": 1344.5000915527344, + "epoch": 0.6710365853658536, + "grad_norm": 2.4165180208172488, + "kl": 0.097900390625, + "learning_rate": 8.849407280414761e-07, + "loss": 0.0039, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4402 + }, + { + "completion_length": 1633.6666870117188, + "epoch": 0.6711890243902439, + "grad_norm": 1.9327159811896555, + "kl": 0.081787109375, + "learning_rate": 8.842128310629624e-07, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4403 + }, + { + "completion_length": 3552.5001220703125, + "epoch": 0.6713414634146342, + "grad_norm": 0.07340728097534625, + "kl": 0.0576171875, + "learning_rate": 8.834851084407602e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4404 + }, + { + "completion_length": 1176.6666870117188, + "epoch": 0.6714939024390244, + "grad_norm": 0.3087784461827594, + "kl": 0.10302734375, + "learning_rate": 8.827575603809197e-07, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4405 + }, + { + "completion_length": 1623.5000610351562, + "epoch": 0.6716463414634146, + "grad_norm": 0.0855304759271128, + "kl": 0.06396484375, + "learning_rate": 8.820301870894416e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4406 + }, + { + "completion_length": 1333.166748046875, + "epoch": 0.6717987804878048, + "grad_norm": 0.10773564770092725, + "kl": 0.0830078125, + "learning_rate": 8.813029887722768e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4407 + }, + { + "completion_length": 1030.5000610351562, + "epoch": 0.6719512195121952, + "grad_norm": 0.08052344423280414, + "kl": 0.0609130859375, + "learning_rate": 8.805759656353275e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4408 + }, + { + "completion_length": 2832.666748046875, + "epoch": 0.6721036585365854, + "grad_norm": 0.056009243007829224, + "kl": 0.0531005859375, + "learning_rate": 8.798491178844451e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4409 + }, + { + "completion_length": 2236.8333740234375, + "epoch": 0.6722560975609756, + "grad_norm": 0.1238787729316965, + "kl": 0.063232421875, + "learning_rate": 8.791224457254324e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4410 + }, + { + "completion_length": 2151.3334350585938, + "epoch": 0.6724085365853658, + "grad_norm": 0.08523944705468674, + "kl": 0.0706787109375, + "learning_rate": 8.78395949364042e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4411 + }, + { + "completion_length": 1336.3333740234375, + "epoch": 0.6725609756097561, + "grad_norm": 0.09801643169675407, + "kl": 0.0653076171875, + "learning_rate": 8.776696290059775e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4412 + }, + { + "completion_length": 2666.6666870117188, + "epoch": 0.6727134146341464, + "grad_norm": 0.8624482481733637, + "kl": 0.06103515625, + "learning_rate": 8.769434848568914e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4413 + }, + { + "completion_length": 1417.6666870117188, + "epoch": 0.6728658536585366, + "grad_norm": 0.13092111381569924, + "kl": 0.091064453125, + "learning_rate": 8.762175171223869e-07, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4414 + }, + { + "completion_length": 1587.0000915527344, + "epoch": 0.6730182926829268, + "grad_norm": 0.09442093448378976, + "kl": 0.0780029296875, + "learning_rate": 8.754917260080169e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4415 + }, + { + "completion_length": 758.6666870117188, + "epoch": 0.6731707317073171, + "grad_norm": 0.12848268148018024, + "kl": 0.08056640625, + "learning_rate": 8.74766111719286e-07, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4416 + }, + { + "completion_length": 1240.5, + "epoch": 0.6733231707317073, + "grad_norm": 0.09501694225209845, + "kl": 0.060302734375, + "learning_rate": 8.740406744616458e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4417 + }, + { + "completion_length": 2445.83349609375, + "epoch": 0.6734756097560975, + "grad_norm": 1.3446093715327794, + "kl": 0.077880859375, + "learning_rate": 8.733154144405e-07, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4418 + }, + { + "completion_length": 801.3333435058594, + "epoch": 0.6736280487804878, + "grad_norm": 0.1504876115541734, + "kl": 0.09033203125, + "learning_rate": 8.725903318612015e-07, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4419 + }, + { + "completion_length": 1657.3333435058594, + "epoch": 0.6737804878048781, + "grad_norm": 1.0762084828526721, + "kl": 0.0682373046875, + "learning_rate": 8.718654269290535e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4420 + }, + { + "completion_length": 2074.166748046875, + "epoch": 0.6739329268292683, + "grad_norm": 0.08583207991614528, + "kl": 0.06591796875, + "learning_rate": 8.711406998493071e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4421 + }, + { + "completion_length": 1352.6666870117188, + "epoch": 0.6740853658536585, + "grad_norm": 0.11189430010272142, + "kl": 0.068115234375, + "learning_rate": 8.704161508271647e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4422 + }, + { + "completion_length": 1865.0001220703125, + "epoch": 0.6742378048780487, + "grad_norm": 0.08001107190235578, + "kl": 0.0621337890625, + "learning_rate": 8.696917800677783e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4423 + }, + { + "completion_length": 1465.666748046875, + "epoch": 0.6743902439024391, + "grad_norm": 0.6757111923880067, + "kl": 0.096923828125, + "learning_rate": 8.689675877762487e-07, + "loss": 0.0039, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4424 + }, + { + "completion_length": 1658.166748046875, + "epoch": 0.6745426829268293, + "grad_norm": 0.0761219498548838, + "kl": 0.0712890625, + "learning_rate": 8.682435741576271e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4425 + }, + { + "completion_length": 951.3333435058594, + "epoch": 0.6746951219512195, + "grad_norm": 0.08004210471213853, + "kl": 0.0511474609375, + "learning_rate": 8.675197394169122e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4426 + }, + { + "completion_length": 1083.6667175292969, + "epoch": 0.6748475609756097, + "grad_norm": 0.12756290074078255, + "kl": 0.0740966796875, + "learning_rate": 8.66796083759054e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4427 + }, + { + "completion_length": 879.1666870117188, + "epoch": 0.675, + "grad_norm": 0.10149188972583753, + "kl": 0.065185546875, + "learning_rate": 8.660726073889511e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4428 + }, + { + "completion_length": 1261.0, + "epoch": 0.6751524390243903, + "grad_norm": 0.08516709607990093, + "kl": 0.0537109375, + "learning_rate": 8.65349310511451e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4429 + }, + { + "completion_length": 1769.8333740234375, + "epoch": 0.6753048780487805, + "grad_norm": 0.07698712479755591, + "kl": 0.06787109375, + "learning_rate": 8.646261933313513e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4430 + }, + { + "completion_length": 1314.0000610351562, + "epoch": 0.6754573170731707, + "grad_norm": 0.23071497263715507, + "kl": 0.078369140625, + "learning_rate": 8.639032560533979e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4431 + }, + { + "completion_length": 1207.0000305175781, + "epoch": 0.675609756097561, + "grad_norm": 0.09724971128154314, + "kl": 0.0616455078125, + "learning_rate": 8.631804988822859e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4432 + }, + { + "completion_length": 1784.1666870117188, + "epoch": 0.6757621951219512, + "grad_norm": 0.10714780528187859, + "kl": 0.0677490234375, + "learning_rate": 8.6245792202266e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4433 + }, + { + "completion_length": 1710.0, + "epoch": 0.6759146341463415, + "grad_norm": 0.10481910849436515, + "kl": 0.064697265625, + "learning_rate": 8.617355256791123e-07, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 4434 + }, + { + "completion_length": 1009.0, + "epoch": 0.6760670731707317, + "grad_norm": 0.22261162571273177, + "kl": 0.082763671875, + "learning_rate": 8.610133100561855e-07, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4435 + }, + { + "completion_length": 687.0000305175781, + "epoch": 0.676219512195122, + "grad_norm": 1.8945121490803376, + "kl": 0.08935546875, + "learning_rate": 8.602912753583704e-07, + "loss": 0.0036, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 4436 + }, + { + "completion_length": 1543.3333740234375, + "epoch": 0.6763719512195122, + "grad_norm": 0.08925803775662244, + "kl": 0.076416015625, + "learning_rate": 8.595694217901068e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4437 + }, + { + "completion_length": 1664.166748046875, + "epoch": 0.6765243902439024, + "grad_norm": 1.3018934711463157, + "kl": 0.07470703125, + "learning_rate": 8.588477495557833e-07, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4438 + }, + { + "completion_length": 1805.3334350585938, + "epoch": 0.6766768292682926, + "grad_norm": 0.07492249666321509, + "kl": 0.063720703125, + "learning_rate": 8.581262588597361e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4439 + }, + { + "completion_length": 2345.5000610351562, + "epoch": 0.676829268292683, + "grad_norm": 0.7756661757250652, + "kl": 0.050537109375, + "learning_rate": 8.574049499062509e-07, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4440 + }, + { + "completion_length": 848.1666870117188, + "epoch": 0.6769817073170732, + "grad_norm": 0.11232108777402985, + "kl": 0.06396484375, + "learning_rate": 8.566838228995624e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4441 + }, + { + "completion_length": 2575.166748046875, + "epoch": 0.6771341463414634, + "grad_norm": 0.0810569347812803, + "kl": 0.0711669921875, + "learning_rate": 8.55962878043853e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4442 + }, + { + "completion_length": 1310.8333435058594, + "epoch": 0.6772865853658536, + "grad_norm": 0.11989443525466609, + "kl": 0.073486328125, + "learning_rate": 8.552421155432543e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4443 + }, + { + "completion_length": 2191.5000610351562, + "epoch": 0.677439024390244, + "grad_norm": 0.8913090698049027, + "kl": 0.0594482421875, + "learning_rate": 8.545215356018445e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4444 + }, + { + "completion_length": 1339.8333740234375, + "epoch": 0.6775914634146342, + "grad_norm": 1.3928262214699865, + "kl": 0.088623046875, + "learning_rate": 8.538011384236521e-07, + "loss": 0.0035, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4445 + }, + { + "completion_length": 2398.166748046875, + "epoch": 0.6777439024390244, + "grad_norm": 0.07418443683046423, + "kl": 0.060302734375, + "learning_rate": 8.530809242126529e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4446 + }, + { + "completion_length": 1975.5001220703125, + "epoch": 0.6778963414634146, + "grad_norm": 0.08282372322610594, + "kl": 0.076171875, + "learning_rate": 8.523608931727712e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4447 + }, + { + "completion_length": 1525.3333435058594, + "epoch": 0.6780487804878049, + "grad_norm": 0.09503029126833527, + "kl": 0.068359375, + "learning_rate": 8.516410455078793e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4448 + }, + { + "completion_length": 3237.5001220703125, + "epoch": 0.6782012195121951, + "grad_norm": 0.04689356432106945, + "kl": 0.0599365234375, + "learning_rate": 8.509213814217978e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4449 + }, + { + "completion_length": 1903.8333740234375, + "epoch": 0.6783536585365854, + "grad_norm": 0.07936390591723361, + "kl": 0.06494140625, + "learning_rate": 8.502019011182948e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4450 + }, + { + "completion_length": 1459.0000610351562, + "epoch": 0.6785060975609756, + "grad_norm": 1.6980739937739535, + "kl": 0.0772705078125, + "learning_rate": 8.494826048010872e-07, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4451 + }, + { + "completion_length": 2122.3333740234375, + "epoch": 0.6786585365853659, + "grad_norm": 0.20126228353869213, + "kl": 0.0699462890625, + "learning_rate": 8.487634926738385e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4452 + }, + { + "completion_length": 1647.5000610351562, + "epoch": 0.6788109756097561, + "grad_norm": 0.2483680802728623, + "kl": 0.100830078125, + "learning_rate": 8.48044564940161e-07, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4453 + }, + { + "completion_length": 1913.0000610351562, + "epoch": 0.6789634146341463, + "grad_norm": 0.07401676148208827, + "kl": 0.05322265625, + "learning_rate": 8.47325821803615e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4454 + }, + { + "completion_length": 2804.0001220703125, + "epoch": 0.6791158536585366, + "grad_norm": 1.3376132675049666, + "kl": 0.048095703125, + "learning_rate": 8.466072634677078e-07, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4455 + }, + { + "completion_length": 2271.5000915527344, + "epoch": 0.6792682926829269, + "grad_norm": 0.12678137421076133, + "kl": 0.0609130859375, + "learning_rate": 8.458888901358958e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4456 + }, + { + "completion_length": 1220.3333740234375, + "epoch": 0.6794207317073171, + "grad_norm": 0.22071783387445704, + "kl": 0.086669921875, + "learning_rate": 8.451707020115802e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4457 + }, + { + "completion_length": 3995.5, + "epoch": 0.6795731707317073, + "grad_norm": 0.04195048494599706, + "kl": 0.0504150390625, + "learning_rate": 8.444526992981123e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4458 + }, + { + "completion_length": 2185.166748046875, + "epoch": 0.6797256097560975, + "grad_norm": 0.062167808008818035, + "kl": 0.069091796875, + "learning_rate": 8.437348821987901e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4459 + }, + { + "completion_length": 1791.5000610351562, + "epoch": 0.6798780487804879, + "grad_norm": 0.13249709912149002, + "kl": 0.060546875, + "learning_rate": 8.430172509168594e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4460 + }, + { + "completion_length": 1982.0000915527344, + "epoch": 0.6800304878048781, + "grad_norm": 0.0792661376114855, + "kl": 0.068359375, + "learning_rate": 8.422998056555132e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4461 + }, + { + "completion_length": 1337.8333740234375, + "epoch": 0.6801829268292683, + "grad_norm": 0.08727471964791349, + "kl": 0.0635986328125, + "learning_rate": 8.415825466178906e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4462 + }, + { + "completion_length": 2102.666717529297, + "epoch": 0.6803353658536585, + "grad_norm": 0.0625402565981471, + "kl": 0.0513916015625, + "learning_rate": 8.408654740070797e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4463 + }, + { + "completion_length": 1527.666748046875, + "epoch": 0.6804878048780488, + "grad_norm": 0.18082349363754993, + "kl": 0.089111328125, + "learning_rate": 8.401485880261151e-07, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4464 + }, + { + "completion_length": 1970.1666870117188, + "epoch": 0.680640243902439, + "grad_norm": 0.07533921894364322, + "kl": 0.0618896484375, + "learning_rate": 8.394318888779794e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4465 + }, + { + "completion_length": 2059.0000610351562, + "epoch": 0.6807926829268293, + "grad_norm": 0.08478147698901108, + "kl": 0.0621337890625, + "learning_rate": 8.387153767655994e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4466 + }, + { + "completion_length": 2225.666748046875, + "epoch": 0.6809451219512195, + "grad_norm": 0.08490268936447927, + "kl": 0.0535888671875, + "learning_rate": 8.379990518918529e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4467 + }, + { + "completion_length": 846.0000305175781, + "epoch": 0.6810975609756098, + "grad_norm": 0.17276408768386123, + "kl": 0.05615234375, + "learning_rate": 8.372829144595623e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4468 + }, + { + "completion_length": 2418.5, + "epoch": 0.68125, + "grad_norm": 0.09646772053175677, + "kl": 0.0538330078125, + "learning_rate": 8.365669646714984e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4469 + }, + { + "completion_length": 1127.1666870117188, + "epoch": 0.6814024390243902, + "grad_norm": 0.19750395103719343, + "kl": 0.081298828125, + "learning_rate": 8.358512027303762e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4470 + }, + { + "completion_length": 3086.0, + "epoch": 0.6815548780487805, + "grad_norm": 0.07884133830957507, + "kl": 0.0604248046875, + "learning_rate": 8.351356288388602e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4471 + }, + { + "completion_length": 1391.0, + "epoch": 0.6817073170731708, + "grad_norm": 0.08758659089981262, + "kl": 0.0555419921875, + "learning_rate": 8.344202431995604e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4472 + }, + { + "completion_length": 2041.8333435058594, + "epoch": 0.681859756097561, + "grad_norm": 0.07740055738448154, + "kl": 0.065185546875, + "learning_rate": 8.337050460150341e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4473 + }, + { + "completion_length": 3031.83349609375, + "epoch": 0.6820121951219512, + "grad_norm": 0.06820136948319021, + "kl": 0.0550537109375, + "learning_rate": 8.329900374877854e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4474 + }, + { + "completion_length": 1920.8334350585938, + "epoch": 0.6821646341463414, + "grad_norm": 0.10594326669945649, + "kl": 0.06884765625, + "learning_rate": 8.322752178202633e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4475 + }, + { + "completion_length": 1504.6666870117188, + "epoch": 0.6823170731707318, + "grad_norm": 0.1340182639208057, + "kl": 0.07861328125, + "learning_rate": 8.315605872148653e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4476 + }, + { + "completion_length": 2052.6666870117188, + "epoch": 0.682469512195122, + "grad_norm": 0.06779465407601018, + "kl": 0.062255859375, + "learning_rate": 8.308461458739344e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4477 + }, + { + "completion_length": 2558.3333740234375, + "epoch": 0.6826219512195122, + "grad_norm": 1.1908512149778796, + "kl": 0.0657958984375, + "learning_rate": 8.301318939997604e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4478 + }, + { + "completion_length": 2456.5, + "epoch": 0.6827743902439024, + "grad_norm": 0.12984321914342625, + "kl": 0.0535888671875, + "learning_rate": 8.294178317945798e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4479 + }, + { + "completion_length": 1590.0000610351562, + "epoch": 0.6829268292682927, + "grad_norm": 0.08111824858420416, + "kl": 0.069091796875, + "learning_rate": 8.287039594605737e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4480 + }, + { + "completion_length": 1803.8333740234375, + "epoch": 0.683079268292683, + "grad_norm": 0.16490749185637513, + "kl": 0.0723876953125, + "learning_rate": 8.279902771998714e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4481 + }, + { + "completion_length": 919.6666870117188, + "epoch": 0.6832317073170732, + "grad_norm": 0.10661503943776938, + "kl": 0.0616455078125, + "learning_rate": 8.272767852145472e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4482 + }, + { + "completion_length": 1913.8334350585938, + "epoch": 0.6833841463414634, + "grad_norm": 2.105714632017786, + "kl": 0.057373046875, + "learning_rate": 8.265634837066227e-07, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4483 + }, + { + "completion_length": 3010.8333740234375, + "epoch": 0.6835365853658537, + "grad_norm": 0.07447174360034285, + "kl": 0.0655517578125, + "learning_rate": 8.258503728780638e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4484 + }, + { + "completion_length": 1092.1666870117188, + "epoch": 0.6836890243902439, + "grad_norm": 0.18029404615873107, + "kl": 0.0679931640625, + "learning_rate": 8.25137452930783e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4485 + }, + { + "completion_length": 947.6666870117188, + "epoch": 0.6838414634146341, + "grad_norm": 0.09675950054942543, + "kl": 0.052490234375, + "learning_rate": 8.244247240666406e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4486 + }, + { + "completion_length": 1482.5, + "epoch": 0.6839939024390244, + "grad_norm": 0.08166916118291465, + "kl": 0.06298828125, + "learning_rate": 8.237121864874414e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4487 + }, + { + "completion_length": 1579.6666870117188, + "epoch": 0.6841463414634147, + "grad_norm": 0.15848672891290055, + "kl": 0.0543212890625, + "learning_rate": 8.229998403949348e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4488 + }, + { + "completion_length": 1968.3333740234375, + "epoch": 0.6842987804878049, + "grad_norm": 0.1460492256154788, + "kl": 0.0582275390625, + "learning_rate": 8.222876859908177e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4489 + }, + { + "completion_length": 1132.0000305175781, + "epoch": 0.6844512195121951, + "grad_norm": 2.192768909159939, + "kl": 0.078125, + "learning_rate": 8.215757234767322e-07, + "loss": 0.0031, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4490 + }, + { + "completion_length": 2351.5000915527344, + "epoch": 0.6846036585365853, + "grad_norm": 0.08381013848292293, + "kl": 0.063232421875, + "learning_rate": 8.208639530542659e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4491 + }, + { + "completion_length": 2244.8334350585938, + "epoch": 0.6847560975609757, + "grad_norm": 0.06747237684901804, + "kl": 0.056884765625, + "learning_rate": 8.20152374924953e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4492 + }, + { + "completion_length": 2146.0, + "epoch": 0.6849085365853659, + "grad_norm": 1.1757331754772742, + "kl": 0.0609130859375, + "learning_rate": 8.19440989290271e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4493 + }, + { + "completion_length": 3067.5001220703125, + "epoch": 0.6850609756097561, + "grad_norm": 0.09297535967547332, + "kl": 0.056396484375, + "learning_rate": 8.187297963516452e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4494 + }, + { + "completion_length": 925.3333435058594, + "epoch": 0.6852134146341463, + "grad_norm": 0.10862875985609394, + "kl": 0.06982421875, + "learning_rate": 8.180187963104453e-07, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4495 + }, + { + "completion_length": 1537.3334350585938, + "epoch": 0.6853658536585366, + "grad_norm": 1.428322115276935, + "kl": 0.0616455078125, + "learning_rate": 8.173079893679873e-07, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4496 + }, + { + "completion_length": 2444.0, + "epoch": 0.6855182926829269, + "grad_norm": 0.16028934707614056, + "kl": 0.0570068359375, + "learning_rate": 8.165973757255303e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4497 + }, + { + "completion_length": 1159.1666870117188, + "epoch": 0.6856707317073171, + "grad_norm": 1.5072375722415698, + "kl": 0.061767578125, + "learning_rate": 8.15886955584281e-07, + "loss": 0.0025, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4498 + }, + { + "completion_length": 1643.3333435058594, + "epoch": 0.6858231707317073, + "grad_norm": 0.0854850020713019, + "kl": 0.076904296875, + "learning_rate": 8.151767291453903e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4499 + }, + { + "completion_length": 1332.6666870117188, + "epoch": 0.6859756097560976, + "grad_norm": 0.08274285435704341, + "kl": 0.0552978515625, + "learning_rate": 8.144666966099543e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4500 + }, + { + "completion_length": 1019.3333740234375, + "epoch": 0.6861280487804878, + "grad_norm": 2.2820828858406585, + "kl": 0.09814453125, + "learning_rate": 8.137568581790154e-07, + "loss": 0.0039, + "reward": 0.6666666865348816, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 4501 + }, + { + "completion_length": 775.6666870117188, + "epoch": 0.686280487804878, + "grad_norm": 0.10268006036974191, + "kl": 0.054443359375, + "learning_rate": 8.13047214053558e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4502 + }, + { + "completion_length": 3755.8333740234375, + "epoch": 0.6864329268292683, + "grad_norm": 0.03822081880913982, + "kl": 0.043701171875, + "learning_rate": 8.123377644345141e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4503 + }, + { + "completion_length": 2726.666748046875, + "epoch": 0.6865853658536586, + "grad_norm": 0.057023809760364294, + "kl": 0.064697265625, + "learning_rate": 8.116285095227604e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4504 + }, + { + "completion_length": 1700.0000610351562, + "epoch": 0.6867378048780488, + "grad_norm": 0.13589855101785706, + "kl": 0.0537109375, + "learning_rate": 8.109194495191189e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4505 + }, + { + "completion_length": 1401.166748046875, + "epoch": 0.686890243902439, + "grad_norm": 0.08706298158258519, + "kl": 0.0533447265625, + "learning_rate": 8.102105846243539e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4506 + }, + { + "completion_length": 1925.5, + "epoch": 0.6870426829268292, + "grad_norm": 0.061931085271747906, + "kl": 0.0618896484375, + "learning_rate": 8.095019150391767e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4507 + }, + { + "completion_length": 2168.8333435058594, + "epoch": 0.6871951219512196, + "grad_norm": 0.07176215742206442, + "kl": 0.0560302734375, + "learning_rate": 8.087934409642426e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4508 + }, + { + "completion_length": 2587.5, + "epoch": 0.6873475609756098, + "grad_norm": 0.07092810143242363, + "kl": 0.057373046875, + "learning_rate": 8.080851626001519e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4509 + }, + { + "completion_length": 1661.1666870117188, + "epoch": 0.6875, + "grad_norm": 0.0707954405233185, + "kl": 0.0576171875, + "learning_rate": 8.073770801474494e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4510 + }, + { + "completion_length": 3653.3333740234375, + "epoch": 0.6876524390243902, + "grad_norm": 0.046399967358792614, + "kl": 0.050048828125, + "learning_rate": 8.066691938066233e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4511 + }, + { + "completion_length": 1416.3333740234375, + "epoch": 0.6878048780487804, + "grad_norm": 0.10404298062993986, + "kl": 0.0869140625, + "learning_rate": 8.05961503778108e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4512 + }, + { + "completion_length": 2694.8333740234375, + "epoch": 0.6879573170731708, + "grad_norm": 0.12369171357815455, + "kl": 0.0572509765625, + "learning_rate": 8.052540102622814e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4513 + }, + { + "completion_length": 2299.166748046875, + "epoch": 0.688109756097561, + "grad_norm": 1.4077693435625662, + "kl": 0.08203125, + "learning_rate": 8.045467134594663e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4514 + }, + { + "completion_length": 2082.0000610351562, + "epoch": 0.6882621951219512, + "grad_norm": 0.05124579495635754, + "kl": 0.0517578125, + "learning_rate": 8.038396135699284e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4515 + }, + { + "completion_length": 2523.3334350585938, + "epoch": 0.6884146341463414, + "grad_norm": 0.10175686061185828, + "kl": 0.076171875, + "learning_rate": 8.03132710793879e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4516 + }, + { + "completion_length": 1489.1666870117188, + "epoch": 0.6885670731707317, + "grad_norm": 0.1662544730299754, + "kl": 0.06689453125, + "learning_rate": 8.024260053314738e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4517 + }, + { + "completion_length": 1857.8334350585938, + "epoch": 0.688719512195122, + "grad_norm": 0.09936426306675306, + "kl": 0.06103515625, + "learning_rate": 8.017194973828114e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4518 + }, + { + "completion_length": 862.3333740234375, + "epoch": 0.6888719512195122, + "grad_norm": 0.11217283817767895, + "kl": 0.0615234375, + "learning_rate": 8.01013187147936e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4519 + }, + { + "completion_length": 2934.166748046875, + "epoch": 0.6890243902439024, + "grad_norm": 0.8472015880417706, + "kl": 0.05322265625, + "learning_rate": 8.003070748268339e-07, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4520 + }, + { + "completion_length": 1663.5, + "epoch": 0.6891768292682927, + "grad_norm": 0.07268670170343415, + "kl": 0.0511474609375, + "learning_rate": 7.996011606194368e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4521 + }, + { + "completion_length": 2462.5001220703125, + "epoch": 0.6893292682926829, + "grad_norm": 1.1462417559672542, + "kl": 0.0657958984375, + "learning_rate": 7.988954447256196e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4522 + }, + { + "completion_length": 1377.8333740234375, + "epoch": 0.6894817073170731, + "grad_norm": 0.13262518886418373, + "kl": 0.081298828125, + "learning_rate": 7.981899273452028e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4523 + }, + { + "completion_length": 2119.0000915527344, + "epoch": 0.6896341463414634, + "grad_norm": 1.9161355890012062, + "kl": 0.075439453125, + "learning_rate": 7.974846086779475e-07, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4524 + }, + { + "completion_length": 713.3333435058594, + "epoch": 0.6897865853658537, + "grad_norm": 0.09572440756480823, + "kl": 0.0537109375, + "learning_rate": 7.967794889235614e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4525 + }, + { + "completion_length": 858.3333740234375, + "epoch": 0.6899390243902439, + "grad_norm": 0.09022416008108514, + "kl": 0.049072265625, + "learning_rate": 7.960745682816943e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4526 + }, + { + "completion_length": 3093.5001220703125, + "epoch": 0.6900914634146341, + "grad_norm": 0.06188479808356125, + "kl": 0.0625, + "learning_rate": 7.953698469519406e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4527 + }, + { + "completion_length": 1730.6666870117188, + "epoch": 0.6902439024390243, + "grad_norm": 0.08490635163702587, + "kl": 0.056884765625, + "learning_rate": 7.94665325133837e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4528 + }, + { + "completion_length": 2433.8333740234375, + "epoch": 0.6903963414634147, + "grad_norm": 0.10881984345845633, + "kl": 0.0716552734375, + "learning_rate": 7.939610030268649e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4529 + }, + { + "completion_length": 2671.3333740234375, + "epoch": 0.6905487804878049, + "grad_norm": 0.08322995672512543, + "kl": 0.0595703125, + "learning_rate": 7.932568808304485e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4530 + }, + { + "completion_length": 1234.6666870117188, + "epoch": 0.6907012195121951, + "grad_norm": 0.09393195854030918, + "kl": 0.06689453125, + "learning_rate": 7.92552958743956e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4531 + }, + { + "completion_length": 2199.0000915527344, + "epoch": 0.6908536585365853, + "grad_norm": 0.07568726498346263, + "kl": 0.0516357421875, + "learning_rate": 7.918492369666989e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4532 + }, + { + "completion_length": 2899.5, + "epoch": 0.6910060975609756, + "grad_norm": 0.07188105513070808, + "kl": 0.0693359375, + "learning_rate": 7.911457156979305e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4533 + }, + { + "completion_length": 1352.1666870117188, + "epoch": 0.6911585365853659, + "grad_norm": 1.454265403227855, + "kl": 0.0621337890625, + "learning_rate": 7.904423951368492e-07, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4534 + }, + { + "completion_length": 2207.33349609375, + "epoch": 0.6913109756097561, + "grad_norm": 0.05641181665802854, + "kl": 0.0550537109375, + "learning_rate": 7.897392754825956e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4535 + }, + { + "completion_length": 2899.8333740234375, + "epoch": 0.6914634146341463, + "grad_norm": 0.0546311467040937, + "kl": 0.0565185546875, + "learning_rate": 7.890363569342539e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4536 + }, + { + "completion_length": 2187.1666870117188, + "epoch": 0.6916158536585366, + "grad_norm": 1.718881061343471, + "kl": 0.0753173828125, + "learning_rate": 7.883336396908517e-07, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4537 + }, + { + "completion_length": 1420.5000915527344, + "epoch": 0.6917682926829268, + "grad_norm": 0.12404370048570809, + "kl": 0.07275390625, + "learning_rate": 7.876311239513577e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4538 + }, + { + "completion_length": 1941.0, + "epoch": 0.691920731707317, + "grad_norm": 41.93728209089416, + "kl": 0.442626953125, + "learning_rate": 7.869288099146857e-07, + "loss": 0.0177, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4539 + }, + { + "completion_length": 2087.0, + "epoch": 0.6920731707317073, + "grad_norm": 0.07916727616078582, + "kl": 0.0517578125, + "learning_rate": 7.862266977796907e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4540 + }, + { + "completion_length": 2960.0, + "epoch": 0.6922256097560976, + "grad_norm": 0.054069791935013364, + "kl": 0.0513916015625, + "learning_rate": 7.855247877451733e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4541 + }, + { + "completion_length": 1472.3333740234375, + "epoch": 0.6923780487804878, + "grad_norm": 0.2112618316561713, + "kl": 0.079345703125, + "learning_rate": 7.84823080009873e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4542 + }, + { + "completion_length": 1633.3333435058594, + "epoch": 0.692530487804878, + "grad_norm": 1.0395323227517954, + "kl": 0.059326171875, + "learning_rate": 7.841215747724748e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4543 + }, + { + "completion_length": 1745.0000915527344, + "epoch": 0.6926829268292682, + "grad_norm": 0.2104803069716952, + "kl": 0.087158203125, + "learning_rate": 7.834202722316054e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4544 + }, + { + "completion_length": 1329.6667175292969, + "epoch": 0.6928353658536586, + "grad_norm": 0.12009041171358766, + "kl": 0.119140625, + "learning_rate": 7.82719172585835e-07, + "loss": 0.0048, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4545 + }, + { + "completion_length": 955.5000610351562, + "epoch": 0.6929878048780488, + "grad_norm": 0.09475758058988196, + "kl": 0.0594482421875, + "learning_rate": 7.820182760336744e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4546 + }, + { + "completion_length": 2054.666748046875, + "epoch": 0.693140243902439, + "grad_norm": 0.07660706979866409, + "kl": 0.074951171875, + "learning_rate": 7.813175827735786e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4547 + }, + { + "completion_length": 2238.0, + "epoch": 0.6932926829268292, + "grad_norm": 0.08771246010328784, + "kl": 0.066650390625, + "learning_rate": 7.806170930039446e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4548 + }, + { + "completion_length": 2947.0001220703125, + "epoch": 0.6934451219512195, + "grad_norm": 0.058988363965529124, + "kl": 0.07177734375, + "learning_rate": 7.799168069231117e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4549 + }, + { + "completion_length": 1002.6666870117188, + "epoch": 0.6935975609756098, + "grad_norm": 0.10348628394225608, + "kl": 0.070068359375, + "learning_rate": 7.792167247293623e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4550 + }, + { + "completion_length": 1150.1666870117188, + "epoch": 0.69375, + "grad_norm": 0.1279849008472779, + "kl": 0.12744140625, + "learning_rate": 7.785168466209188e-07, + "loss": 0.0051, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4551 + }, + { + "completion_length": 927.8333740234375, + "epoch": 0.6939024390243902, + "grad_norm": 0.16225663680199587, + "kl": 0.061279296875, + "learning_rate": 7.778171727959482e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4552 + }, + { + "completion_length": 1606.3333740234375, + "epoch": 0.6940548780487805, + "grad_norm": 0.10537559099322809, + "kl": 0.059326171875, + "learning_rate": 7.771177034525589e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4553 + }, + { + "completion_length": 1698.1666870117188, + "epoch": 0.6942073170731707, + "grad_norm": 0.11505234787575294, + "kl": 0.0687255859375, + "learning_rate": 7.764184387888009e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4554 + }, + { + "completion_length": 1654.8333740234375, + "epoch": 0.694359756097561, + "grad_norm": 0.12374693228300421, + "kl": 0.0626220703125, + "learning_rate": 7.757193790026676e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4555 + }, + { + "completion_length": 1613.3333435058594, + "epoch": 0.6945121951219512, + "grad_norm": 0.09124208782414796, + "kl": 0.0673828125, + "learning_rate": 7.750205242920921e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4556 + }, + { + "completion_length": 1229.8333740234375, + "epoch": 0.6946646341463415, + "grad_norm": 1.3553864173680819, + "kl": 0.0908203125, + "learning_rate": 7.743218748549514e-07, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4557 + }, + { + "completion_length": 924.8333740234375, + "epoch": 0.6948170731707317, + "grad_norm": 0.09998653797129506, + "kl": 0.046630859375, + "learning_rate": 7.736234308890637e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4558 + }, + { + "completion_length": 1431.8333740234375, + "epoch": 0.6949695121951219, + "grad_norm": 0.09422911218907055, + "kl": 0.0638427734375, + "learning_rate": 7.729251925921892e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4559 + }, + { + "completion_length": 1690.6667175292969, + "epoch": 0.6951219512195121, + "grad_norm": 0.08586957464094011, + "kl": 0.0633544921875, + "learning_rate": 7.722271601620293e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4560 + }, + { + "completion_length": 1331.1667175292969, + "epoch": 0.6952743902439025, + "grad_norm": 0.19082237668563184, + "kl": 0.080810546875, + "learning_rate": 7.715293337962284e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4561 + }, + { + "completion_length": 1015.6666870117188, + "epoch": 0.6954268292682927, + "grad_norm": 0.10028683349629033, + "kl": 0.066650390625, + "learning_rate": 7.708317136923708e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4562 + }, + { + "completion_length": 1411.1666870117188, + "epoch": 0.6955792682926829, + "grad_norm": 0.07875913314193154, + "kl": 0.0640869140625, + "learning_rate": 7.701343000479844e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4563 + }, + { + "completion_length": 1530.0000915527344, + "epoch": 0.6957317073170731, + "grad_norm": 0.07160896020682202, + "kl": 0.05810546875, + "learning_rate": 7.694370930605362e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4564 + }, + { + "completion_length": 2162.8333740234375, + "epoch": 0.6958841463414634, + "grad_norm": 0.09263811739416342, + "kl": 0.0584716796875, + "learning_rate": 7.687400929274366e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4565 + }, + { + "completion_length": 1651.5000610351562, + "epoch": 0.6960365853658537, + "grad_norm": 0.07185577185310062, + "kl": 0.072021484375, + "learning_rate": 7.680432998460371e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4566 + }, + { + "completion_length": 1729.3333435058594, + "epoch": 0.6961890243902439, + "grad_norm": 1.1612793351718314, + "kl": 0.072998046875, + "learning_rate": 7.673467140136299e-07, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4567 + }, + { + "completion_length": 1733.666748046875, + "epoch": 0.6963414634146341, + "grad_norm": 0.3858633972415054, + "kl": 0.11474609375, + "learning_rate": 7.6665033562745e-07, + "loss": 0.0046, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4568 + }, + { + "completion_length": 2313.666748046875, + "epoch": 0.6964939024390244, + "grad_norm": 0.08812174563805843, + "kl": 0.070556640625, + "learning_rate": 7.659541648846711e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4569 + }, + { + "completion_length": 1532.6666870117188, + "epoch": 0.6966463414634146, + "grad_norm": 1.1825983035914758, + "kl": 0.075927734375, + "learning_rate": 7.652582019824103e-07, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4570 + }, + { + "completion_length": 1374.166748046875, + "epoch": 0.6967987804878049, + "grad_norm": 0.07973794814598026, + "kl": 0.0810546875, + "learning_rate": 7.645624471177253e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4571 + }, + { + "completion_length": 869.6666870117188, + "epoch": 0.6969512195121951, + "grad_norm": 0.6041652273330894, + "kl": 0.0830078125, + "learning_rate": 7.638669004876145e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4572 + }, + { + "completion_length": 2229.0, + "epoch": 0.6971036585365854, + "grad_norm": 0.07288757849021289, + "kl": 0.0704345703125, + "learning_rate": 7.631715622890186e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4573 + }, + { + "completion_length": 1361.166748046875, + "epoch": 0.6972560975609756, + "grad_norm": 0.08790491740081804, + "kl": 0.0670166015625, + "learning_rate": 7.624764327188167e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4574 + }, + { + "completion_length": 1297.5000610351562, + "epoch": 0.6974085365853658, + "grad_norm": 0.13406479327744067, + "kl": 0.099853515625, + "learning_rate": 7.617815119738312e-07, + "loss": 0.004, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4575 + }, + { + "completion_length": 1098.5, + "epoch": 0.697560975609756, + "grad_norm": 0.1010856009283622, + "kl": 0.062744140625, + "learning_rate": 7.610868002508248e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4576 + }, + { + "completion_length": 1491.6666870117188, + "epoch": 0.6977134146341464, + "grad_norm": 0.06705942563936712, + "kl": 0.064208984375, + "learning_rate": 7.603922977465002e-07, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 4577 + }, + { + "completion_length": 1020.8333740234375, + "epoch": 0.6978658536585366, + "grad_norm": 1.3495072496914982, + "kl": 0.0760498046875, + "learning_rate": 7.596980046575021e-07, + "loss": 0.0031, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 4578 + }, + { + "completion_length": 833.0, + "epoch": 0.6980182926829268, + "grad_norm": 0.10331608237615612, + "kl": 0.057861328125, + "learning_rate": 7.59003921180415e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4579 + }, + { + "completion_length": 869.0000610351562, + "epoch": 0.698170731707317, + "grad_norm": 0.10252306588051631, + "kl": 0.072021484375, + "learning_rate": 7.583100475117643e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4580 + }, + { + "completion_length": 970.6666870117188, + "epoch": 0.6983231707317074, + "grad_norm": 0.22749390084645413, + "kl": 0.0587158203125, + "learning_rate": 7.576163838480168e-07, + "loss": 0.0023, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 4581 + }, + { + "completion_length": 1225.8333740234375, + "epoch": 0.6984756097560976, + "grad_norm": 0.137107996679394, + "kl": 0.100341796875, + "learning_rate": 7.569229303855776e-07, + "loss": 0.004, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4582 + }, + { + "completion_length": 1925.0000610351562, + "epoch": 0.6986280487804878, + "grad_norm": 0.06837356854119347, + "kl": 0.0557861328125, + "learning_rate": 7.562296873207946e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4583 + }, + { + "completion_length": 2222.666748046875, + "epoch": 0.698780487804878, + "grad_norm": 0.06789741357522688, + "kl": 0.0601806640625, + "learning_rate": 7.555366548499551e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4584 + }, + { + "completion_length": 985.5000305175781, + "epoch": 0.6989329268292683, + "grad_norm": 0.08288882932889834, + "kl": 0.0562744140625, + "learning_rate": 7.548438331692869e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4585 + }, + { + "completion_length": 2796.666748046875, + "epoch": 0.6990853658536585, + "grad_norm": 2.5387465045247795, + "kl": 0.0667724609375, + "learning_rate": 7.541512224749588e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4586 + }, + { + "completion_length": 2349.8334350585938, + "epoch": 0.6992378048780488, + "grad_norm": 0.08807482321750802, + "kl": 0.057861328125, + "learning_rate": 7.534588229630781e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4587 + }, + { + "completion_length": 1462.0000915527344, + "epoch": 0.699390243902439, + "grad_norm": 0.09300736906168856, + "kl": 0.067626953125, + "learning_rate": 7.527666348296941e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4588 + }, + { + "completion_length": 1597.8333740234375, + "epoch": 0.6995426829268293, + "grad_norm": 0.10290991528379874, + "kl": 0.0499267578125, + "learning_rate": 7.520746582707953e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4589 + }, + { + "completion_length": 1855.6667175292969, + "epoch": 0.6996951219512195, + "grad_norm": 0.08448318721075557, + "kl": 0.0509033203125, + "learning_rate": 7.513828934823112e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4590 + }, + { + "completion_length": 1696.0000610351562, + "epoch": 0.6998475609756097, + "grad_norm": 0.08729080424886143, + "kl": 0.0869140625, + "learning_rate": 7.506913406601098e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4591 + }, + { + "completion_length": 1128.8333740234375, + "epoch": 0.7, + "grad_norm": 0.09491513824751867, + "kl": 0.05615234375, + "learning_rate": 7.500000000000003e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4592 + }, + { + "completion_length": 1849.3333740234375, + "epoch": 0.7001524390243903, + "grad_norm": 0.08289066566104915, + "kl": 0.056640625, + "learning_rate": 7.493088716977317e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4593 + }, + { + "completion_length": 1361.8333740234375, + "epoch": 0.7003048780487805, + "grad_norm": 0.09430301372268626, + "kl": 0.0740966796875, + "learning_rate": 7.486179559489926e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4594 + }, + { + "completion_length": 1482.6666870117188, + "epoch": 0.7004573170731707, + "grad_norm": 0.11364904982184879, + "kl": 0.072021484375, + "learning_rate": 7.479272529494117e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4595 + }, + { + "completion_length": 1437.0000610351562, + "epoch": 0.7006097560975609, + "grad_norm": 0.14488314191706636, + "kl": 0.086669921875, + "learning_rate": 7.472367628945564e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4596 + }, + { + "completion_length": 1379.166748046875, + "epoch": 0.7007621951219513, + "grad_norm": 0.10132424938328091, + "kl": 0.0667724609375, + "learning_rate": 7.465464859799356e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4597 + }, + { + "completion_length": 3114.666748046875, + "epoch": 0.7009146341463415, + "grad_norm": 0.14817023390021325, + "kl": 0.0665283203125, + "learning_rate": 7.458564224009966e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4598 + }, + { + "completion_length": 2590.166748046875, + "epoch": 0.7010670731707317, + "grad_norm": 0.10172781423449886, + "kl": 0.0682373046875, + "learning_rate": 7.451665723531276e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4599 + }, + { + "completion_length": 1366.6667175292969, + "epoch": 0.7012195121951219, + "grad_norm": 0.0912854259381043, + "kl": 0.0614013671875, + "learning_rate": 7.444769360316534e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4600 + }, + { + "completion_length": 987.5000305175781, + "epoch": 0.7013719512195122, + "grad_norm": 0.20164089245754996, + "kl": 0.078857421875, + "learning_rate": 7.437875136318416e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4601 + }, + { + "completion_length": 1978.3333740234375, + "epoch": 0.7015243902439025, + "grad_norm": 1.7701966411071477, + "kl": 0.0849609375, + "learning_rate": 7.430983053488973e-07, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4602 + }, + { + "completion_length": 846.6666870117188, + "epoch": 0.7016768292682927, + "grad_norm": 0.3096057028022325, + "kl": 0.095458984375, + "learning_rate": 7.424093113779659e-07, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4603 + }, + { + "completion_length": 1187.0000610351562, + "epoch": 0.7018292682926829, + "grad_norm": 0.1512483331378574, + "kl": 0.071533203125, + "learning_rate": 7.417205319141321e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4604 + }, + { + "completion_length": 2272.3333740234375, + "epoch": 0.7019817073170732, + "grad_norm": 0.11084995336673052, + "kl": 0.067138671875, + "learning_rate": 7.410319671524185e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4605 + }, + { + "completion_length": 958.6666870117188, + "epoch": 0.7021341463414634, + "grad_norm": 0.13929720573672189, + "kl": 0.0771484375, + "learning_rate": 7.403436172877885e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4606 + }, + { + "completion_length": 867.6666870117188, + "epoch": 0.7022865853658536, + "grad_norm": 0.0881866176663443, + "kl": 0.044677734375, + "learning_rate": 7.396554825151442e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4607 + }, + { + "completion_length": 1979.3334350585938, + "epoch": 0.7024390243902439, + "grad_norm": 0.11402966825520759, + "kl": 0.0504150390625, + "learning_rate": 7.389675630293269e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4608 + }, + { + "completion_length": 985.3333740234375, + "epoch": 0.7025914634146342, + "grad_norm": 0.12140953085592252, + "kl": 0.0673828125, + "learning_rate": 7.382798590251161e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4609 + }, + { + "completion_length": 2305.0000610351562, + "epoch": 0.7027439024390244, + "grad_norm": 0.08181708495142957, + "kl": 0.050048828125, + "learning_rate": 7.37592370697231e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4610 + }, + { + "completion_length": 1942.6666870117188, + "epoch": 0.7028963414634146, + "grad_norm": 0.1338517538972224, + "kl": 0.086181640625, + "learning_rate": 7.3690509824033e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4611 + }, + { + "completion_length": 2843.666748046875, + "epoch": 0.7030487804878048, + "grad_norm": 1.760680628271267, + "kl": 0.056396484375, + "learning_rate": 7.362180418490099e-07, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4612 + }, + { + "completion_length": 1377.5000305175781, + "epoch": 0.7032012195121952, + "grad_norm": 0.13620317342711843, + "kl": 0.084716796875, + "learning_rate": 7.355312017178069e-07, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4613 + }, + { + "completion_length": 2371.666748046875, + "epoch": 0.7033536585365854, + "grad_norm": 1.0256557132008106, + "kl": 0.0665283203125, + "learning_rate": 7.348445780411939e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4614 + }, + { + "completion_length": 2789.666748046875, + "epoch": 0.7035060975609756, + "grad_norm": 0.051020059059984135, + "kl": 0.0618896484375, + "learning_rate": 7.341581710135859e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4615 + }, + { + "completion_length": 905.5, + "epoch": 0.7036585365853658, + "grad_norm": 0.0771093469569818, + "kl": 0.046875, + "learning_rate": 7.334719808293342e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4616 + }, + { + "completion_length": 1226.6666870117188, + "epoch": 0.7038109756097561, + "grad_norm": 0.0906660050963505, + "kl": 0.0687255859375, + "learning_rate": 7.327860076827298e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4617 + }, + { + "completion_length": 1019.3333740234375, + "epoch": 0.7039634146341464, + "grad_norm": 0.08036572759590906, + "kl": 0.0517578125, + "learning_rate": 7.321002517680007e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4618 + }, + { + "completion_length": 1940.0000915527344, + "epoch": 0.7041158536585366, + "grad_norm": 0.0909055753458958, + "kl": 0.0645751953125, + "learning_rate": 7.314147132793146e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4619 + }, + { + "completion_length": 1931.666748046875, + "epoch": 0.7042682926829268, + "grad_norm": 0.05448188340522849, + "kl": 0.051513671875, + "learning_rate": 7.307293924107781e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4620 + }, + { + "completion_length": 1119.3333740234375, + "epoch": 0.7044207317073171, + "grad_norm": 0.09962681006723155, + "kl": 0.0771484375, + "learning_rate": 7.300442893564357e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4621 + }, + { + "completion_length": 1417.6666870117188, + "epoch": 0.7045731707317073, + "grad_norm": 0.15191975563615337, + "kl": 0.06396484375, + "learning_rate": 7.29359404310269e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4622 + }, + { + "completion_length": 1865.0000610351562, + "epoch": 0.7047256097560975, + "grad_norm": 1.488638601564182, + "kl": 0.071533203125, + "learning_rate": 7.286747374661997e-07, + "loss": 0.0029, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4623 + }, + { + "completion_length": 1667.5, + "epoch": 0.7048780487804878, + "grad_norm": 0.10281420138555195, + "kl": 0.071533203125, + "learning_rate": 7.279902890180865e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4624 + }, + { + "completion_length": 1409.0000457763672, + "epoch": 0.7050304878048781, + "grad_norm": 2.6291339511016862, + "kl": 0.092529296875, + "learning_rate": 7.273060591597273e-07, + "loss": 0.0037, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4625 + }, + { + "completion_length": 1169.3333740234375, + "epoch": 0.7051829268292683, + "grad_norm": 0.13947139693736552, + "kl": 0.10986328125, + "learning_rate": 7.266220480848578e-07, + "loss": 0.0044, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4626 + }, + { + "completion_length": 1698.5000915527344, + "epoch": 0.7053353658536585, + "grad_norm": 0.11064299892158122, + "kl": 0.081298828125, + "learning_rate": 7.259382559871503e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4627 + }, + { + "completion_length": 1836.8333740234375, + "epoch": 0.7054878048780487, + "grad_norm": 1.0516560349664126, + "kl": 0.0673828125, + "learning_rate": 7.252546830602171e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4628 + }, + { + "completion_length": 2168.166748046875, + "epoch": 0.7056402439024391, + "grad_norm": 0.11466687859450289, + "kl": 0.0582275390625, + "learning_rate": 7.245713294976073e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4629 + }, + { + "completion_length": 946.5, + "epoch": 0.7057926829268293, + "grad_norm": 0.09093397293365565, + "kl": 0.0340576171875, + "learning_rate": 7.238881954928084e-07, + "loss": 0.0014, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4630 + }, + { + "completion_length": 1233.1666870117188, + "epoch": 0.7059451219512195, + "grad_norm": 0.09201450202204983, + "kl": 0.065185546875, + "learning_rate": 7.232052812392463e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4631 + }, + { + "completion_length": 2124.6666870117188, + "epoch": 0.7060975609756097, + "grad_norm": 0.10467691848445891, + "kl": 0.068603515625, + "learning_rate": 7.225225869302818e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4632 + }, + { + "completion_length": 2337.5, + "epoch": 0.70625, + "grad_norm": 0.0632916400873016, + "kl": 0.0506591796875, + "learning_rate": 7.218401127592175e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4633 + }, + { + "completion_length": 2296.666748046875, + "epoch": 0.7064024390243903, + "grad_norm": 1.6971911391842889, + "kl": 0.07177734375, + "learning_rate": 7.211578589192914e-07, + "loss": 0.0029, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4634 + }, + { + "completion_length": 1417.3333740234375, + "epoch": 0.7065548780487805, + "grad_norm": 0.23053261256917684, + "kl": 0.080322265625, + "learning_rate": 7.204758256036795e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4635 + }, + { + "completion_length": 960.6667175292969, + "epoch": 0.7067073170731707, + "grad_norm": 0.1619537296376593, + "kl": 0.06396484375, + "learning_rate": 7.197940130054943e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4636 + }, + { + "completion_length": 717.8333435058594, + "epoch": 0.706859756097561, + "grad_norm": 1.8567254743642916, + "kl": 0.124267578125, + "learning_rate": 7.191124213177878e-07, + "loss": 0.005, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4637 + }, + { + "completion_length": 2962.166748046875, + "epoch": 0.7070121951219512, + "grad_norm": 0.05740343419568686, + "kl": 0.0599365234375, + "learning_rate": 7.184310507335478e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4638 + }, + { + "completion_length": 1125.5000610351562, + "epoch": 0.7071646341463415, + "grad_norm": 9.53242237716064, + "kl": 0.125, + "learning_rate": 7.177499014457013e-07, + "loss": 0.005, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4639 + }, + { + "completion_length": 1289.5000610351562, + "epoch": 0.7073170731707317, + "grad_norm": 1.3371268839775965, + "kl": 0.060302734375, + "learning_rate": 7.1706897364711e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4640 + }, + { + "completion_length": 1726.1666870117188, + "epoch": 0.707469512195122, + "grad_norm": 0.08752764670955412, + "kl": 0.04541015625, + "learning_rate": 7.16388267530575e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4641 + }, + { + "completion_length": 1259.6667175292969, + "epoch": 0.7076219512195122, + "grad_norm": 1.458142054261279, + "kl": 0.0675048828125, + "learning_rate": 7.15707783288834e-07, + "loss": 0.0027, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 4642 + }, + { + "completion_length": 1385.6667175292969, + "epoch": 0.7077743902439024, + "grad_norm": 0.1258266167617879, + "kl": 0.0966796875, + "learning_rate": 7.150275211145622e-07, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4643 + }, + { + "completion_length": 1161.5000305175781, + "epoch": 0.7079268292682926, + "grad_norm": 0.11168073531983745, + "kl": 0.06591796875, + "learning_rate": 7.143474812003715e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4644 + }, + { + "completion_length": 1086.1666870117188, + "epoch": 0.708079268292683, + "grad_norm": 0.16734669613712277, + "kl": 0.07470703125, + "learning_rate": 7.136676637388107e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4645 + }, + { + "completion_length": 1459.3333740234375, + "epoch": 0.7082317073170732, + "grad_norm": 0.11684196354325015, + "kl": 0.091064453125, + "learning_rate": 7.129880689223657e-07, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4646 + }, + { + "completion_length": 1460.3333740234375, + "epoch": 0.7083841463414634, + "grad_norm": 1.6397192907908185, + "kl": 0.08154296875, + "learning_rate": 7.123086969434599e-07, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4647 + }, + { + "completion_length": 938.8333740234375, + "epoch": 0.7085365853658536, + "grad_norm": 0.09435903750782455, + "kl": 0.0552978515625, + "learning_rate": 7.116295479944533e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4648 + }, + { + "completion_length": 1545.3333740234375, + "epoch": 0.708689024390244, + "grad_norm": 1.1521219949518315, + "kl": 0.0693359375, + "learning_rate": 7.109506222676431e-07, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4649 + }, + { + "completion_length": 1225.1666870117188, + "epoch": 0.7088414634146342, + "grad_norm": 0.10756746390592592, + "kl": 0.074951171875, + "learning_rate": 7.102719199552619e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4650 + }, + { + "completion_length": 1798.5000610351562, + "epoch": 0.7089939024390244, + "grad_norm": 0.0670583940515013, + "kl": 0.0521240234375, + "learning_rate": 7.0959344124948e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4651 + }, + { + "completion_length": 940.3333435058594, + "epoch": 0.7091463414634146, + "grad_norm": 1.7772262544888062, + "kl": 0.078369140625, + "learning_rate": 7.089151863424061e-07, + "loss": 0.0031, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4652 + }, + { + "completion_length": 1546.8333740234375, + "epoch": 0.7092987804878049, + "grad_norm": 0.2761381235886902, + "kl": 0.09814453125, + "learning_rate": 7.082371554260823e-07, + "loss": 0.0039, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4653 + }, + { + "completion_length": 2019.0000610351562, + "epoch": 0.7094512195121951, + "grad_norm": 0.09240866801628472, + "kl": 0.0699462890625, + "learning_rate": 7.075593486924892e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4654 + }, + { + "completion_length": 2352.166717529297, + "epoch": 0.7096036585365854, + "grad_norm": 0.07696637504152551, + "kl": 0.0494384765625, + "learning_rate": 7.068817663335438e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4655 + }, + { + "completion_length": 2614.1666870117188, + "epoch": 0.7097560975609756, + "grad_norm": 0.06246554819634699, + "kl": 0.057861328125, + "learning_rate": 7.062044085410991e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4656 + }, + { + "completion_length": 2583.166748046875, + "epoch": 0.7099085365853659, + "grad_norm": 0.05108349277567801, + "kl": 0.052001953125, + "learning_rate": 7.055272755069453e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4657 + }, + { + "completion_length": 708.6666870117188, + "epoch": 0.7100609756097561, + "grad_norm": 1.8394948856181421, + "kl": 0.093017578125, + "learning_rate": 7.048503674228075e-07, + "loss": 0.0037, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4658 + }, + { + "completion_length": 3026.83349609375, + "epoch": 0.7102134146341463, + "grad_norm": 0.06608678603257363, + "kl": 0.0540771484375, + "learning_rate": 7.041736844803485e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4659 + }, + { + "completion_length": 2398.0, + "epoch": 0.7103658536585366, + "grad_norm": 0.10377293187658859, + "kl": 0.0657958984375, + "learning_rate": 7.034972268711669e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4660 + }, + { + "completion_length": 1008.3333740234375, + "epoch": 0.7105182926829269, + "grad_norm": 2.2144232912934623, + "kl": 0.096923828125, + "learning_rate": 7.028209947867973e-07, + "loss": 0.0039, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4661 + }, + { + "completion_length": 2245.0, + "epoch": 0.7106707317073171, + "grad_norm": 0.07315832972955297, + "kl": 0.0657958984375, + "learning_rate": 7.021449884187115e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4662 + }, + { + "completion_length": 1398.5000610351562, + "epoch": 0.7108231707317073, + "grad_norm": 0.07735874517311275, + "kl": 0.0479736328125, + "learning_rate": 7.01469207958315e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4663 + }, + { + "completion_length": 1933.5000610351562, + "epoch": 0.7109756097560975, + "grad_norm": 0.07944477642554568, + "kl": 0.041748046875, + "learning_rate": 7.007936535969516e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4664 + }, + { + "completion_length": 2192.6666870117188, + "epoch": 0.7111280487804879, + "grad_norm": 0.06798700457135574, + "kl": 0.0546875, + "learning_rate": 7.001183255259005e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4665 + }, + { + "completion_length": 2159.8333740234375, + "epoch": 0.7112804878048781, + "grad_norm": 1.8748395011408943, + "kl": 0.083984375, + "learning_rate": 6.994432239363762e-07, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4666 + }, + { + "completion_length": 971.8333435058594, + "epoch": 0.7114329268292683, + "grad_norm": 0.09789374312214495, + "kl": 0.05712890625, + "learning_rate": 6.987683490195305e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4667 + }, + { + "completion_length": 1766.3334350585938, + "epoch": 0.7115853658536585, + "grad_norm": 0.08234417687106525, + "kl": 0.064208984375, + "learning_rate": 6.980937009664487e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4668 + }, + { + "completion_length": 3545.8333740234375, + "epoch": 0.7117378048780488, + "grad_norm": 0.051320703525755164, + "kl": 0.051513671875, + "learning_rate": 6.974192799681533e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4669 + }, + { + "completion_length": 2832.3333740234375, + "epoch": 0.711890243902439, + "grad_norm": 0.07824450108483544, + "kl": 0.05859375, + "learning_rate": 6.96745086215604e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4670 + }, + { + "completion_length": 2816.3333740234375, + "epoch": 0.7120426829268293, + "grad_norm": 0.04388230519791293, + "kl": 0.056396484375, + "learning_rate": 6.960711198996929e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4671 + }, + { + "completion_length": 2544.0000610351562, + "epoch": 0.7121951219512195, + "grad_norm": 0.07419133363331, + "kl": 0.0614013671875, + "learning_rate": 6.9539738121125e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4672 + }, + { + "completion_length": 2136.166748046875, + "epoch": 0.7123475609756098, + "grad_norm": 0.07938590167370947, + "kl": 0.05419921875, + "learning_rate": 6.947238703410402e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4673 + }, + { + "completion_length": 2307.5, + "epoch": 0.7125, + "grad_norm": 0.08032501087165325, + "kl": 0.059326171875, + "learning_rate": 6.94050587479764e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4674 + }, + { + "completion_length": 3023.5, + "epoch": 0.7126524390243902, + "grad_norm": 0.042844018183728225, + "kl": 0.0491943359375, + "learning_rate": 6.933775328180577e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4675 + }, + { + "completion_length": 2469.666748046875, + "epoch": 0.7128048780487805, + "grad_norm": 0.08179739287463077, + "kl": 0.0650634765625, + "learning_rate": 6.927047065464915e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4676 + }, + { + "completion_length": 2330.0001220703125, + "epoch": 0.7129573170731708, + "grad_norm": 0.10662049515637365, + "kl": 0.060546875, + "learning_rate": 6.920321088555726e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4677 + }, + { + "completion_length": 2083.166748046875, + "epoch": 0.713109756097561, + "grad_norm": 0.07212819487914983, + "kl": 0.0521240234375, + "learning_rate": 6.913597399357426e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4678 + }, + { + "completion_length": 2530.5000610351562, + "epoch": 0.7132621951219512, + "grad_norm": 0.7402743970185003, + "kl": 0.0450439453125, + "learning_rate": 6.906875999773788e-07, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4679 + }, + { + "completion_length": 2235.3333740234375, + "epoch": 0.7134146341463414, + "grad_norm": 0.07992859015978873, + "kl": 0.0780029296875, + "learning_rate": 6.90015689170794e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4680 + }, + { + "completion_length": 2508.0001220703125, + "epoch": 0.7135670731707318, + "grad_norm": 0.21256878766951512, + "kl": 0.065185546875, + "learning_rate": 6.893440077062346e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4681 + }, + { + "completion_length": 1865.8333740234375, + "epoch": 0.713719512195122, + "grad_norm": 0.10463775989023019, + "kl": 0.063720703125, + "learning_rate": 6.886725557738831e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4682 + }, + { + "completion_length": 2617.6666870117188, + "epoch": 0.7138719512195122, + "grad_norm": 0.07066324309309803, + "kl": 0.057373046875, + "learning_rate": 6.880013335638573e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4683 + }, + { + "completion_length": 2260.3333740234375, + "epoch": 0.7140243902439024, + "grad_norm": 0.9224499905817833, + "kl": 0.0628662109375, + "learning_rate": 6.873303412662103e-07, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4684 + }, + { + "completion_length": 2882.666748046875, + "epoch": 0.7141768292682927, + "grad_norm": 0.7127421840977657, + "kl": 0.0443115234375, + "learning_rate": 6.866595790709279e-07, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4685 + }, + { + "completion_length": 2763.0001220703125, + "epoch": 0.714329268292683, + "grad_norm": 0.05226639411335388, + "kl": 0.0545654296875, + "learning_rate": 6.859890471679331e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4686 + }, + { + "completion_length": 2694.5001220703125, + "epoch": 0.7144817073170732, + "grad_norm": 0.10721496251302944, + "kl": 0.0638427734375, + "learning_rate": 6.853187457470828e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4687 + }, + { + "completion_length": 1512.5000610351562, + "epoch": 0.7146341463414634, + "grad_norm": 0.08827273883362154, + "kl": 0.0565185546875, + "learning_rate": 6.846486749981684e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4688 + }, + { + "completion_length": 991.0, + "epoch": 0.7147865853658537, + "grad_norm": 0.10335916733892744, + "kl": 0.05029296875, + "learning_rate": 6.839788351109167e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4689 + }, + { + "completion_length": 1140.8333740234375, + "epoch": 0.7149390243902439, + "grad_norm": 0.11494267716019568, + "kl": 0.06494140625, + "learning_rate": 6.833092262749884e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4690 + }, + { + "completion_length": 1988.0000610351562, + "epoch": 0.7150914634146341, + "grad_norm": 0.11134877498748333, + "kl": 0.0592041015625, + "learning_rate": 6.826398486799792e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4691 + }, + { + "completion_length": 1869.1666870117188, + "epoch": 0.7152439024390244, + "grad_norm": 0.18824527102774202, + "kl": 0.0689697265625, + "learning_rate": 6.819707025154194e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4692 + }, + { + "completion_length": 2412.0, + "epoch": 0.7153963414634147, + "grad_norm": 1.2506909888636542, + "kl": 0.0643310546875, + "learning_rate": 6.813017879707737e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4693 + }, + { + "completion_length": 2568.5, + "epoch": 0.7155487804878049, + "grad_norm": 0.08372261361400232, + "kl": 0.067626953125, + "learning_rate": 6.806331052354401e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4694 + }, + { + "completion_length": 1254.5, + "epoch": 0.7157012195121951, + "grad_norm": 0.06731183612898299, + "kl": 0.044189453125, + "learning_rate": 6.799646544987529e-07, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4695 + }, + { + "completion_length": 2775.166748046875, + "epoch": 0.7158536585365853, + "grad_norm": 0.10514094624230864, + "kl": 0.0615234375, + "learning_rate": 6.792964359499794e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4696 + }, + { + "completion_length": 2501.3333740234375, + "epoch": 0.7160060975609757, + "grad_norm": 1.0033981405179282, + "kl": 0.058349609375, + "learning_rate": 6.786284497783215e-07, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4697 + }, + { + "completion_length": 1038.5, + "epoch": 0.7161585365853659, + "grad_norm": 1.77680182147055, + "kl": 0.07373046875, + "learning_rate": 6.779606961729162e-07, + "loss": 0.0029, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 4698 + }, + { + "completion_length": 2640.5, + "epoch": 0.7163109756097561, + "grad_norm": 0.08324270635880869, + "kl": 0.062744140625, + "learning_rate": 6.772931753228325e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4699 + }, + { + "completion_length": 2910.166748046875, + "epoch": 0.7164634146341463, + "grad_norm": 0.0442367994226405, + "kl": 0.0401611328125, + "learning_rate": 6.766258874170752e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4700 + }, + { + "completion_length": 3425.8333740234375, + "epoch": 0.7166158536585366, + "grad_norm": 0.8443594468620182, + "kl": 0.0447998046875, + "learning_rate": 6.759588326445829e-07, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4701 + }, + { + "completion_length": 1041.0000305175781, + "epoch": 0.7167682926829269, + "grad_norm": 0.09636489109693247, + "kl": 0.0574951171875, + "learning_rate": 6.752920111942287e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4702 + }, + { + "completion_length": 2153.0000610351562, + "epoch": 0.7169207317073171, + "grad_norm": 0.14302935092528818, + "kl": 0.081298828125, + "learning_rate": 6.746254232548175e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4703 + }, + { + "completion_length": 1501.0000610351562, + "epoch": 0.7170731707317073, + "grad_norm": 0.09921890869112902, + "kl": 0.0673828125, + "learning_rate": 6.739590690150903e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4704 + }, + { + "completion_length": 986.6667175292969, + "epoch": 0.7172256097560976, + "grad_norm": 0.16949078416693789, + "kl": 0.074462890625, + "learning_rate": 6.732929486637211e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4705 + }, + { + "completion_length": 1455.3333740234375, + "epoch": 0.7173780487804878, + "grad_norm": 0.14125326486199713, + "kl": 0.0618896484375, + "learning_rate": 6.726270623893179e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4706 + }, + { + "completion_length": 2215.166748046875, + "epoch": 0.717530487804878, + "grad_norm": 0.10017184760222786, + "kl": 0.069091796875, + "learning_rate": 6.71961410380422e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4707 + }, + { + "completion_length": 1668.8333435058594, + "epoch": 0.7176829268292683, + "grad_norm": 0.07365050534323139, + "kl": 0.0484619140625, + "learning_rate": 6.712959928255088e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4708 + }, + { + "completion_length": 1972.3333740234375, + "epoch": 0.7178353658536586, + "grad_norm": 1.9504293765751939, + "kl": 0.066162109375, + "learning_rate": 6.70630809912987e-07, + "loss": 0.0026, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 4709 + }, + { + "completion_length": 1726.1666870117188, + "epoch": 0.7179878048780488, + "grad_norm": 0.09529131657278476, + "kl": 0.050048828125, + "learning_rate": 6.699658618311991e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4710 + }, + { + "completion_length": 1586.3333435058594, + "epoch": 0.718140243902439, + "grad_norm": 0.12800607022937618, + "kl": 0.0621337890625, + "learning_rate": 6.693011487684218e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4711 + }, + { + "completion_length": 1596.666748046875, + "epoch": 0.7182926829268292, + "grad_norm": 0.08073368255388357, + "kl": 0.06884765625, + "learning_rate": 6.686366709128632e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4712 + }, + { + "completion_length": 977.1666870117188, + "epoch": 0.7184451219512196, + "grad_norm": 0.11211595561071881, + "kl": 0.077880859375, + "learning_rate": 6.679724284526663e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4713 + }, + { + "completion_length": 1058.5, + "epoch": 0.7185975609756098, + "grad_norm": 0.07604378821154763, + "kl": 0.039306640625, + "learning_rate": 6.673084215759075e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4714 + }, + { + "completion_length": 935.6666870117188, + "epoch": 0.71875, + "grad_norm": 0.0877845335562171, + "kl": 0.053466796875, + "learning_rate": 6.666446504705971e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4715 + }, + { + "completion_length": 2233.166748046875, + "epoch": 0.7189024390243902, + "grad_norm": 0.07781817165322866, + "kl": 0.07861328125, + "learning_rate": 6.65981115324676e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4716 + }, + { + "completion_length": 2108.8333740234375, + "epoch": 0.7190548780487804, + "grad_norm": 0.3042968250959227, + "kl": 0.072509765625, + "learning_rate": 6.653178163260211e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4717 + }, + { + "completion_length": 1136.1667175292969, + "epoch": 0.7192073170731708, + "grad_norm": 0.17312270804669128, + "kl": 0.0751953125, + "learning_rate": 6.646547536624414e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4718 + }, + { + "completion_length": 2513.8334350585938, + "epoch": 0.719359756097561, + "grad_norm": 0.06293129664290087, + "kl": 0.0606689453125, + "learning_rate": 6.639919275216787e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4719 + }, + { + "completion_length": 2129.666748046875, + "epoch": 0.7195121951219512, + "grad_norm": 0.07124908105654613, + "kl": 0.0584716796875, + "learning_rate": 6.633293380914087e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4720 + }, + { + "completion_length": 3264.666748046875, + "epoch": 0.7196646341463414, + "grad_norm": 0.06285613745332085, + "kl": 0.0509033203125, + "learning_rate": 6.626669855592385e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4721 + }, + { + "completion_length": 1126.166748046875, + "epoch": 0.7198170731707317, + "grad_norm": 0.1092540683923896, + "kl": 0.057373046875, + "learning_rate": 6.620048701127097e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4722 + }, + { + "completion_length": 1325.3333740234375, + "epoch": 0.719969512195122, + "grad_norm": 0.08044948326363988, + "kl": 0.0640869140625, + "learning_rate": 6.613429919392959e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4723 + }, + { + "completion_length": 2967.8333740234375, + "epoch": 0.7201219512195122, + "grad_norm": 0.05813771023452128, + "kl": 0.0487060546875, + "learning_rate": 6.60681351226404e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4724 + }, + { + "completion_length": 1624.6666870117188, + "epoch": 0.7202743902439024, + "grad_norm": 1.394760535640775, + "kl": 0.0579833984375, + "learning_rate": 6.600199481613735e-07, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4725 + }, + { + "completion_length": 1809.166748046875, + "epoch": 0.7204268292682927, + "grad_norm": 0.17775661848995003, + "kl": 0.0545654296875, + "learning_rate": 6.593587829314765e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4726 + }, + { + "completion_length": 1136.3333740234375, + "epoch": 0.7205792682926829, + "grad_norm": 2.008281331084773, + "kl": 0.0645751953125, + "learning_rate": 6.586978557239179e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4727 + }, + { + "completion_length": 1833.1666870117188, + "epoch": 0.7207317073170731, + "grad_norm": 1.0620307775833808, + "kl": 0.0657958984375, + "learning_rate": 6.580371667258349e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4728 + }, + { + "completion_length": 1771.666748046875, + "epoch": 0.7208841463414634, + "grad_norm": 0.1871626176302452, + "kl": 0.0830078125, + "learning_rate": 6.573767161242982e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4729 + }, + { + "completion_length": 2300.5000610351562, + "epoch": 0.7210365853658537, + "grad_norm": 0.07853721902506022, + "kl": 0.0635986328125, + "learning_rate": 6.567165041063093e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4730 + }, + { + "completion_length": 1455.1666870117188, + "epoch": 0.7211890243902439, + "grad_norm": 0.09166670716749202, + "kl": 0.0457763671875, + "learning_rate": 6.560565308588037e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4731 + }, + { + "completion_length": 916.3333740234375, + "epoch": 0.7213414634146341, + "grad_norm": 0.11744369419884473, + "kl": 0.0718994140625, + "learning_rate": 6.553967965686483e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4732 + }, + { + "completion_length": 2238.8333435058594, + "epoch": 0.7214939024390243, + "grad_norm": 0.08867741799266105, + "kl": 0.05029296875, + "learning_rate": 6.547373014226437e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4733 + }, + { + "completion_length": 1365.8333740234375, + "epoch": 0.7216463414634147, + "grad_norm": 0.1188536227782415, + "kl": 0.0716552734375, + "learning_rate": 6.540780456075207e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4734 + }, + { + "completion_length": 1785.5, + "epoch": 0.7217987804878049, + "grad_norm": 0.06472974645945845, + "kl": 0.04388427734375, + "learning_rate": 6.534190293099439e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4735 + }, + { + "completion_length": 2060.6666870117188, + "epoch": 0.7219512195121951, + "grad_norm": 0.08959461153336556, + "kl": 0.0859375, + "learning_rate": 6.527602527165099e-07, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4736 + }, + { + "completion_length": 1070.3333435058594, + "epoch": 0.7221036585365853, + "grad_norm": 0.06775609983007803, + "kl": 0.0465087890625, + "learning_rate": 6.521017160137471e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4737 + }, + { + "completion_length": 629.8333435058594, + "epoch": 0.7222560975609756, + "grad_norm": 0.16341432435244121, + "kl": 0.0745849609375, + "learning_rate": 6.514434193881165e-07, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4738 + }, + { + "completion_length": 2140.0000610351562, + "epoch": 0.7224085365853659, + "grad_norm": 0.17629039076397854, + "kl": 0.0748291015625, + "learning_rate": 6.507853630260096e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4739 + }, + { + "completion_length": 978.5, + "epoch": 0.7225609756097561, + "grad_norm": 0.07755823581028265, + "kl": 0.0452880859375, + "learning_rate": 6.501275471137518e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4740 + }, + { + "completion_length": 1306.166748046875, + "epoch": 0.7227134146341463, + "grad_norm": 0.1321980220170242, + "kl": 0.07958984375, + "learning_rate": 6.494699718375991e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4741 + }, + { + "completion_length": 1428.0, + "epoch": 0.7228658536585366, + "grad_norm": 0.16877362509137533, + "kl": 0.07666015625, + "learning_rate": 6.488126373837405e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4742 + }, + { + "completion_length": 1374.166748046875, + "epoch": 0.7230182926829268, + "grad_norm": 0.1333190027475878, + "kl": 0.0626220703125, + "learning_rate": 6.481555439382956e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4743 + }, + { + "completion_length": 1350.5000305175781, + "epoch": 0.723170731707317, + "grad_norm": 0.1098509536295081, + "kl": 0.07080078125, + "learning_rate": 6.474986916873168e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4744 + }, + { + "completion_length": 2429.5000610351562, + "epoch": 0.7233231707317073, + "grad_norm": 1.9074156997712568, + "kl": 0.0670166015625, + "learning_rate": 6.468420808167872e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4745 + }, + { + "completion_length": 2451.166748046875, + "epoch": 0.7234756097560976, + "grad_norm": 0.06385086199279115, + "kl": 0.0546875, + "learning_rate": 6.461857115126225e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4746 + }, + { + "completion_length": 1565.5, + "epoch": 0.7236280487804878, + "grad_norm": 0.31978523751893356, + "kl": 0.088623046875, + "learning_rate": 6.455295839606701e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4747 + }, + { + "completion_length": 1261.3333740234375, + "epoch": 0.723780487804878, + "grad_norm": 0.10219596327072, + "kl": 0.063720703125, + "learning_rate": 6.448736983467072e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4748 + }, + { + "completion_length": 2631.6666870117188, + "epoch": 0.7239329268292682, + "grad_norm": 0.08266739575427426, + "kl": 0.0614013671875, + "learning_rate": 6.442180548564443e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4749 + }, + { + "completion_length": 1260.5, + "epoch": 0.7240853658536586, + "grad_norm": 0.1066576338052188, + "kl": 0.0648193359375, + "learning_rate": 6.435626536755228e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4750 + }, + { + "completion_length": 1771.3333740234375, + "epoch": 0.7242378048780488, + "grad_norm": 0.6202816933458557, + "kl": 0.09228515625, + "learning_rate": 6.429074949895162e-07, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4751 + }, + { + "completion_length": 758.3333435058594, + "epoch": 0.724390243902439, + "grad_norm": 0.11838688265147851, + "kl": 0.061767578125, + "learning_rate": 6.422525789839273e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4752 + }, + { + "completion_length": 1569.5000610351562, + "epoch": 0.7245426829268292, + "grad_norm": 0.09316049784558615, + "kl": 0.072021484375, + "learning_rate": 6.41597905844192e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4753 + }, + { + "completion_length": 2272.0001220703125, + "epoch": 0.7246951219512195, + "grad_norm": 0.11245315205582121, + "kl": 0.0748291015625, + "learning_rate": 6.409434757556771e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4754 + }, + { + "completion_length": 831.6667175292969, + "epoch": 0.7248475609756098, + "grad_norm": 2.0733193364793956, + "kl": 0.06982421875, + "learning_rate": 6.402892889036804e-07, + "loss": 0.0028, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4755 + }, + { + "completion_length": 2680.0001220703125, + "epoch": 0.725, + "grad_norm": 0.09568323216934839, + "kl": 0.057861328125, + "learning_rate": 6.396353454734313e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4756 + }, + { + "completion_length": 996.5, + "epoch": 0.7251524390243902, + "grad_norm": 0.15419306999440666, + "kl": 0.071533203125, + "learning_rate": 6.389816456500888e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4757 + }, + { + "completion_length": 2254.8333740234375, + "epoch": 0.7253048780487805, + "grad_norm": 0.09375127636844005, + "kl": 0.064453125, + "learning_rate": 6.383281896187444e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4758 + }, + { + "completion_length": 1305.666748046875, + "epoch": 0.7254573170731707, + "grad_norm": 1.5836607698209713, + "kl": 0.094970703125, + "learning_rate": 6.376749775644202e-07, + "loss": 0.0038, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4759 + }, + { + "completion_length": 1061.5000610351562, + "epoch": 0.725609756097561, + "grad_norm": 1.4791917926031806, + "kl": 0.08837890625, + "learning_rate": 6.370220096720692e-07, + "loss": 0.0035, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4760 + }, + { + "completion_length": 1556.6666870117188, + "epoch": 0.7257621951219512, + "grad_norm": 0.1707309239138093, + "kl": 0.08349609375, + "learning_rate": 6.363692861265754e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4761 + }, + { + "completion_length": 1281.3333435058594, + "epoch": 0.7259146341463415, + "grad_norm": 0.08692947326263291, + "kl": 0.07421875, + "learning_rate": 6.357168071127524e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4762 + }, + { + "completion_length": 695.3333740234375, + "epoch": 0.7260670731707317, + "grad_norm": 0.10461215736941604, + "kl": 0.04833984375, + "learning_rate": 6.350645728153466e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4763 + }, + { + "completion_length": 1046.0000610351562, + "epoch": 0.7262195121951219, + "grad_norm": 0.11439968477436789, + "kl": 0.0704345703125, + "learning_rate": 6.344125834190345e-07, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4764 + }, + { + "completion_length": 2198.0001220703125, + "epoch": 0.7263719512195121, + "grad_norm": 0.12681357686143704, + "kl": 0.0562744140625, + "learning_rate": 6.337608391084215e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4765 + }, + { + "completion_length": 2681.8333740234375, + "epoch": 0.7265243902439025, + "grad_norm": 0.8709022229109444, + "kl": 0.072509765625, + "learning_rate": 6.331093400680457e-07, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4766 + }, + { + "completion_length": 1510.3333740234375, + "epoch": 0.7266768292682927, + "grad_norm": 0.09227032205049383, + "kl": 0.0615234375, + "learning_rate": 6.324580864823749e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4767 + }, + { + "completion_length": 2149.8333435058594, + "epoch": 0.7268292682926829, + "grad_norm": 0.06278238678681253, + "kl": 0.049072265625, + "learning_rate": 6.318070785358074e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4768 + }, + { + "completion_length": 1731.8333740234375, + "epoch": 0.7269817073170731, + "grad_norm": 0.1003994664497865, + "kl": 0.07080078125, + "learning_rate": 6.311563164126729e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4769 + }, + { + "completion_length": 2490.0000610351562, + "epoch": 0.7271341463414634, + "grad_norm": 0.9100429241723886, + "kl": 0.0628662109375, + "learning_rate": 6.305058002972292e-07, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4770 + }, + { + "completion_length": 3045.3333740234375, + "epoch": 0.7272865853658537, + "grad_norm": 0.07859597056405378, + "kl": 0.059814453125, + "learning_rate": 6.298555303736666e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4771 + }, + { + "completion_length": 1322.8333740234375, + "epoch": 0.7274390243902439, + "grad_norm": 0.26148443829851015, + "kl": 0.0679931640625, + "learning_rate": 6.292055068261051e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4772 + }, + { + "completion_length": 1635.3333740234375, + "epoch": 0.7275914634146341, + "grad_norm": 0.08623947759891834, + "kl": 0.060302734375, + "learning_rate": 6.285557298385946e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4773 + }, + { + "completion_length": 1801.666748046875, + "epoch": 0.7277439024390244, + "grad_norm": 0.07457677781245603, + "kl": 0.055908203125, + "learning_rate": 6.279061995951164e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4774 + }, + { + "completion_length": 3102.666748046875, + "epoch": 0.7278963414634146, + "grad_norm": 0.21678947304716237, + "kl": 0.06884765625, + "learning_rate": 6.272569162795792e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4775 + }, + { + "completion_length": 1232.8333435058594, + "epoch": 0.7280487804878049, + "grad_norm": 0.13163132582331377, + "kl": 0.0662841796875, + "learning_rate": 6.266078800758249e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4776 + }, + { + "completion_length": 2379.666717529297, + "epoch": 0.7282012195121951, + "grad_norm": 0.7913687898186078, + "kl": 0.0572509765625, + "learning_rate": 6.259590911676232e-07, + "loss": 0.0023, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 4777 + }, + { + "completion_length": 3237.3333740234375, + "epoch": 0.7283536585365854, + "grad_norm": 0.09067006486106328, + "kl": 0.0616455078125, + "learning_rate": 6.253105497386752e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4778 + }, + { + "completion_length": 2042.8333740234375, + "epoch": 0.7285060975609756, + "grad_norm": 0.10327525140115403, + "kl": 0.084716796875, + "learning_rate": 6.246622559726117e-07, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4779 + }, + { + "completion_length": 3394.166748046875, + "epoch": 0.7286585365853658, + "grad_norm": 0.04088581361825427, + "kl": 0.0455322265625, + "learning_rate": 6.240142100529917e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4780 + }, + { + "completion_length": 2307.1666870117188, + "epoch": 0.728810975609756, + "grad_norm": 0.18065868278724298, + "kl": 0.0831298828125, + "learning_rate": 6.233664121633067e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4781 + }, + { + "completion_length": 1917.166748046875, + "epoch": 0.7289634146341464, + "grad_norm": 0.06031044659045523, + "kl": 0.0521240234375, + "learning_rate": 6.227188624869767e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4782 + }, + { + "completion_length": 694.6666870117188, + "epoch": 0.7291158536585366, + "grad_norm": 2.1917327970550184, + "kl": 0.104736328125, + "learning_rate": 6.220715612073505e-07, + "loss": 0.0042, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4783 + }, + { + "completion_length": 1914.0001220703125, + "epoch": 0.7292682926829268, + "grad_norm": 0.11440292033222989, + "kl": 0.06201171875, + "learning_rate": 6.214245085077078e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4784 + }, + { + "completion_length": 2147.5001220703125, + "epoch": 0.729420731707317, + "grad_norm": 0.08463767583354198, + "kl": 0.05859375, + "learning_rate": 6.207777045712575e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4785 + }, + { + "completion_length": 2242.8333740234375, + "epoch": 0.7295731707317074, + "grad_norm": 0.09543492648004029, + "kl": 0.05810546875, + "learning_rate": 6.201311495811381e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4786 + }, + { + "completion_length": 1191.3333740234375, + "epoch": 0.7297256097560976, + "grad_norm": 0.12210102702893322, + "kl": 0.05517578125, + "learning_rate": 6.194848437204185e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4787 + }, + { + "completion_length": 1691.666748046875, + "epoch": 0.7298780487804878, + "grad_norm": 0.10656778398101444, + "kl": 0.076171875, + "learning_rate": 6.188387871720946e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4788 + }, + { + "completion_length": 2393.5001220703125, + "epoch": 0.730030487804878, + "grad_norm": 0.06868615525820983, + "kl": 0.0552978515625, + "learning_rate": 6.181929801190943e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4789 + }, + { + "completion_length": 1825.0000915527344, + "epoch": 0.7301829268292683, + "grad_norm": 0.09342725242558221, + "kl": 0.0609130859375, + "learning_rate": 6.175474227442736e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4790 + }, + { + "completion_length": 2143.8333740234375, + "epoch": 0.7303353658536585, + "grad_norm": 0.08325865930122729, + "kl": 0.0670166015625, + "learning_rate": 6.169021152304182e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4791 + }, + { + "completion_length": 811.5000305175781, + "epoch": 0.7304878048780488, + "grad_norm": 0.12397089313305358, + "kl": 0.0703125, + "learning_rate": 6.162570577602433e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4792 + }, + { + "completion_length": 1359.6666870117188, + "epoch": 0.730640243902439, + "grad_norm": 0.10219905678623634, + "kl": 0.0599365234375, + "learning_rate": 6.15612250516392e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4793 + }, + { + "completion_length": 1935.0000915527344, + "epoch": 0.7307926829268293, + "grad_norm": 0.09346720784418001, + "kl": 0.05517578125, + "learning_rate": 6.149676936814377e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4794 + }, + { + "completion_length": 1864.6666870117188, + "epoch": 0.7309451219512195, + "grad_norm": 0.4003733888961887, + "kl": 0.048828125, + "learning_rate": 6.14323387437883e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4795 + }, + { + "completion_length": 2770.5001220703125, + "epoch": 0.7310975609756097, + "grad_norm": 0.05537685986612146, + "kl": 0.0478515625, + "learning_rate": 6.136793319681598e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4796 + }, + { + "completion_length": 1775.6667175292969, + "epoch": 0.73125, + "grad_norm": 0.12540580880432067, + "kl": 0.065185546875, + "learning_rate": 6.130355274546268e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4797 + }, + { + "completion_length": 1762.666748046875, + "epoch": 0.7314024390243903, + "grad_norm": 0.09811851237886182, + "kl": 0.0728759765625, + "learning_rate": 6.123919740795745e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4798 + }, + { + "completion_length": 1872.5000915527344, + "epoch": 0.7315548780487805, + "grad_norm": 0.1196599907761847, + "kl": 0.0545654296875, + "learning_rate": 6.117486720252199e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4799 + }, + { + "completion_length": 1203.0, + "epoch": 0.7317073170731707, + "grad_norm": 0.12870219190889418, + "kl": 0.06591796875, + "learning_rate": 6.11105621473712e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4800 + }, + { + "completion_length": 1566.1666870117188, + "epoch": 0.7318597560975609, + "grad_norm": 1.4461057990065362, + "kl": 0.058349609375, + "learning_rate": 6.104628226071247e-07, + "loss": 0.0023, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 4801 + }, + { + "completion_length": 2318.666748046875, + "epoch": 0.7320121951219513, + "grad_norm": 0.08670174616244523, + "kl": 0.0587158203125, + "learning_rate": 6.098202756074632e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4802 + }, + { + "completion_length": 1650.166748046875, + "epoch": 0.7321646341463415, + "grad_norm": 0.09922579124243416, + "kl": 0.0595703125, + "learning_rate": 6.091779806566605e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4803 + }, + { + "completion_length": 1673.6666870117188, + "epoch": 0.7323170731707317, + "grad_norm": 0.41496054662927845, + "kl": 0.0584716796875, + "learning_rate": 6.085359379365787e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4804 + }, + { + "completion_length": 2063.1666870117188, + "epoch": 0.7324695121951219, + "grad_norm": 0.3110185884123818, + "kl": 0.092041015625, + "learning_rate": 6.078941476290086e-07, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4805 + }, + { + "completion_length": 1754.6666717529297, + "epoch": 0.7326219512195122, + "grad_norm": 2.2128796758547664, + "kl": 0.090087890625, + "learning_rate": 6.07252609915668e-07, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4806 + }, + { + "completion_length": 1607.3333435058594, + "epoch": 0.7327743902439025, + "grad_norm": 0.24690077252621695, + "kl": 0.0655517578125, + "learning_rate": 6.066113249782048e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4807 + }, + { + "completion_length": 2488.3333740234375, + "epoch": 0.7329268292682927, + "grad_norm": 0.14681589971270131, + "kl": 0.0714111328125, + "learning_rate": 6.059702929981952e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4808 + }, + { + "completion_length": 1613.5000610351562, + "epoch": 0.7330792682926829, + "grad_norm": 0.14268719965377888, + "kl": 0.089111328125, + "learning_rate": 6.053295141571432e-07, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4809 + }, + { + "completion_length": 1127.0000305175781, + "epoch": 0.7332317073170732, + "grad_norm": 0.09865460241226967, + "kl": 0.0526123046875, + "learning_rate": 6.046889886364817e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4810 + }, + { + "completion_length": 2749.3333740234375, + "epoch": 0.7333841463414634, + "grad_norm": 1.4980412833351293, + "kl": 0.058837890625, + "learning_rate": 6.040487166175707e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4811 + }, + { + "completion_length": 1351.8333740234375, + "epoch": 0.7335365853658536, + "grad_norm": 1.9194844819831942, + "kl": 0.067626953125, + "learning_rate": 6.034086982816998e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4812 + }, + { + "completion_length": 2531.166748046875, + "epoch": 0.7336890243902439, + "grad_norm": 0.14187434910994667, + "kl": 0.06005859375, + "learning_rate": 6.02768933810086e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4813 + }, + { + "completion_length": 1008.8333740234375, + "epoch": 0.7338414634146342, + "grad_norm": 0.1057651970818213, + "kl": 0.0555419921875, + "learning_rate": 6.021294233838754e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4814 + }, + { + "completion_length": 3137.3333740234375, + "epoch": 0.7339939024390244, + "grad_norm": 0.043923783316162354, + "kl": 0.0526123046875, + "learning_rate": 6.0149016718414e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4815 + }, + { + "completion_length": 2977.166748046875, + "epoch": 0.7341463414634146, + "grad_norm": 0.11920576367631103, + "kl": 0.051025390625, + "learning_rate": 6.008511653918821e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4816 + }, + { + "completion_length": 1174.5000610351562, + "epoch": 0.7342987804878048, + "grad_norm": 0.09936607995951408, + "kl": 0.08056640625, + "learning_rate": 6.002124181880306e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4817 + }, + { + "completion_length": 2049.0, + "epoch": 0.7344512195121952, + "grad_norm": 0.91219780891809, + "kl": 0.0665283203125, + "learning_rate": 5.995739257534441e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4818 + }, + { + "completion_length": 1501.1666870117188, + "epoch": 0.7346036585365854, + "grad_norm": 0.1846591160336009, + "kl": 0.050048828125, + "learning_rate": 5.989356882689062e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4819 + }, + { + "completion_length": 2033.5000610351562, + "epoch": 0.7347560975609756, + "grad_norm": 0.1691995536236787, + "kl": 0.0654296875, + "learning_rate": 5.982977059151307e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4820 + }, + { + "completion_length": 1118.1667175292969, + "epoch": 0.7349085365853658, + "grad_norm": 0.09839382634739616, + "kl": 0.058837890625, + "learning_rate": 5.976599788727581e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4821 + }, + { + "completion_length": 3503.0, + "epoch": 0.7350609756097561, + "grad_norm": 0.04492227828038963, + "kl": 0.045166015625, + "learning_rate": 5.970225073223569e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4822 + }, + { + "completion_length": 2286.0, + "epoch": 0.7352134146341464, + "grad_norm": 0.0911894409034868, + "kl": 0.0535888671875, + "learning_rate": 5.963852914444238e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4823 + }, + { + "completion_length": 1374.5, + "epoch": 0.7353658536585366, + "grad_norm": 0.10585796013887147, + "kl": 0.0657958984375, + "learning_rate": 5.957483314193813e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4824 + }, + { + "completion_length": 1417.0, + "epoch": 0.7355182926829268, + "grad_norm": 0.10749282811510112, + "kl": 0.068115234375, + "learning_rate": 5.951116274275813e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4825 + }, + { + "completion_length": 1958.8334350585938, + "epoch": 0.7356707317073171, + "grad_norm": 0.06818969227349213, + "kl": 0.064208984375, + "learning_rate": 5.944751796493026e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4826 + }, + { + "completion_length": 3350.666748046875, + "epoch": 0.7358231707317073, + "grad_norm": 0.05321398993721897, + "kl": 0.053466796875, + "learning_rate": 5.938389882647521e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4827 + }, + { + "completion_length": 3624.166748046875, + "epoch": 0.7359756097560975, + "grad_norm": 0.040112996050982966, + "kl": 0.04150390625, + "learning_rate": 5.93203053454062e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4828 + }, + { + "completion_length": 1146.6667175292969, + "epoch": 0.7361280487804878, + "grad_norm": 0.1306476707769683, + "kl": 0.06982421875, + "learning_rate": 5.925673753972943e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4829 + }, + { + "completion_length": 2401.5000610351562, + "epoch": 0.7362804878048781, + "grad_norm": 0.07776129290098382, + "kl": 0.054443359375, + "learning_rate": 5.91931954274437e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4830 + }, + { + "completion_length": 674.1666870117188, + "epoch": 0.7364329268292683, + "grad_norm": 0.10697051561650911, + "kl": 0.0511474609375, + "learning_rate": 5.912967902654056e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4831 + }, + { + "completion_length": 3277.0001220703125, + "epoch": 0.7365853658536585, + "grad_norm": 0.04428577860090565, + "kl": 0.046630859375, + "learning_rate": 5.906618835500434e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4832 + }, + { + "completion_length": 2494.6666870117188, + "epoch": 0.7367378048780487, + "grad_norm": 0.05710951753380406, + "kl": 0.046142578125, + "learning_rate": 5.900272343081195e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4833 + }, + { + "completion_length": 1475.8333740234375, + "epoch": 0.7368902439024391, + "grad_norm": 2.1588995425387028, + "kl": 0.0657958984375, + "learning_rate": 5.893928427193311e-07, + "loss": 0.0026, + "reward": 0.6666666865348816, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 4834 + }, + { + "completion_length": 1583.6666870117188, + "epoch": 0.7370426829268293, + "grad_norm": 0.09457262683580522, + "kl": 0.0509033203125, + "learning_rate": 5.887587089633018e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4835 + }, + { + "completion_length": 1004.1666870117188, + "epoch": 0.7371951219512195, + "grad_norm": 0.14184274504872624, + "kl": 0.0574951171875, + "learning_rate": 5.881248332195842e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4836 + }, + { + "completion_length": 2278.166748046875, + "epoch": 0.7373475609756097, + "grad_norm": 1.8668388186506804, + "kl": 0.0673828125, + "learning_rate": 5.874912156676549e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4837 + }, + { + "completion_length": 1978.3333740234375, + "epoch": 0.7375, + "grad_norm": 0.08192312056900009, + "kl": 0.060791015625, + "learning_rate": 5.868578564869191e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4838 + }, + { + "completion_length": 1981.6666870117188, + "epoch": 0.7376524390243903, + "grad_norm": 0.09531926434662023, + "kl": 0.052490234375, + "learning_rate": 5.862247558567083e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4839 + }, + { + "completion_length": 2565.0, + "epoch": 0.7378048780487805, + "grad_norm": 0.06290839464406417, + "kl": 0.049072265625, + "learning_rate": 5.855919139562815e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4840 + }, + { + "completion_length": 3412.8333740234375, + "epoch": 0.7379573170731707, + "grad_norm": 0.0440216798792427, + "kl": 0.041259765625, + "learning_rate": 5.849593309648244e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4841 + }, + { + "completion_length": 1229.5000610351562, + "epoch": 0.738109756097561, + "grad_norm": 0.11747618368398191, + "kl": 0.0648193359375, + "learning_rate": 5.843270070614475e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4842 + }, + { + "completion_length": 2714.666748046875, + "epoch": 0.7382621951219512, + "grad_norm": 0.05154356895759733, + "kl": 0.05126953125, + "learning_rate": 5.836949424251901e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4843 + }, + { + "completion_length": 1238.0, + "epoch": 0.7384146341463415, + "grad_norm": 0.08879534741853054, + "kl": 0.05322265625, + "learning_rate": 5.830631372350176e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4844 + }, + { + "completion_length": 3268.8333740234375, + "epoch": 0.7385670731707317, + "grad_norm": 0.04880929000728051, + "kl": 0.0526123046875, + "learning_rate": 5.82431591669822e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4845 + }, + { + "completion_length": 3414.166748046875, + "epoch": 0.738719512195122, + "grad_norm": 0.048191694090071714, + "kl": 0.0396728515625, + "learning_rate": 5.818003059084205e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4846 + }, + { + "completion_length": 1321.0000610351562, + "epoch": 0.7388719512195122, + "grad_norm": 0.07679107870115921, + "kl": 0.04266357421875, + "learning_rate": 5.811692801295583e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4847 + }, + { + "completion_length": 2904.0001220703125, + "epoch": 0.7390243902439024, + "grad_norm": 1.7345090480776542, + "kl": 0.0457763671875, + "learning_rate": 5.805385145119064e-07, + "loss": 0.0018, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 4848 + }, + { + "completion_length": 1411.5000305175781, + "epoch": 0.7391768292682926, + "grad_norm": 0.13906724711537763, + "kl": 0.0634765625, + "learning_rate": 5.799080092340621e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4849 + }, + { + "completion_length": 1904.0000915527344, + "epoch": 0.739329268292683, + "grad_norm": 0.09394941195731424, + "kl": 0.0579833984375, + "learning_rate": 5.792777644745498e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4850 + }, + { + "completion_length": 1603.3333740234375, + "epoch": 0.7394817073170732, + "grad_norm": 1.698754278102346, + "kl": 0.07080078125, + "learning_rate": 5.786477804118182e-07, + "loss": 0.0028, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 4851 + }, + { + "completion_length": 2066.6666870117188, + "epoch": 0.7396341463414634, + "grad_norm": 1.2837928166084867, + "kl": 0.05615234375, + "learning_rate": 5.780180572242438e-07, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4852 + }, + { + "completion_length": 1394.3333740234375, + "epoch": 0.7397865853658536, + "grad_norm": 0.12599308929764172, + "kl": 0.0760498046875, + "learning_rate": 5.773885950901289e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4853 + }, + { + "completion_length": 1469.6666870117188, + "epoch": 0.739939024390244, + "grad_norm": 0.10950443267300324, + "kl": 0.052734375, + "learning_rate": 5.767593941877019e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4854 + }, + { + "completion_length": 1298.3333740234375, + "epoch": 0.7400914634146342, + "grad_norm": 0.10182153085916928, + "kl": 0.05859375, + "learning_rate": 5.76130454695117e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4855 + }, + { + "completion_length": 1096.1666870117188, + "epoch": 0.7402439024390244, + "grad_norm": 0.15105708233991927, + "kl": 0.079833984375, + "learning_rate": 5.755017767904543e-07, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4856 + }, + { + "completion_length": 2732.0000610351562, + "epoch": 0.7403963414634146, + "grad_norm": 0.05565316619106226, + "kl": 0.0540771484375, + "learning_rate": 5.748733606517205e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4857 + }, + { + "completion_length": 1688.166748046875, + "epoch": 0.7405487804878049, + "grad_norm": 0.31703321012953006, + "kl": 0.078857421875, + "learning_rate": 5.74245206456848e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4858 + }, + { + "completion_length": 2550.166748046875, + "epoch": 0.7407012195121951, + "grad_norm": 0.17994338173669747, + "kl": 0.0736083984375, + "learning_rate": 5.736173143836938e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4859 + }, + { + "completion_length": 2196.666748046875, + "epoch": 0.7408536585365854, + "grad_norm": 0.08855503871208861, + "kl": 0.066162109375, + "learning_rate": 5.729896846100419e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4860 + }, + { + "completion_length": 2372.3333740234375, + "epoch": 0.7410060975609756, + "grad_norm": 0.06303329753016547, + "kl": 0.051513671875, + "learning_rate": 5.723623173136022e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4861 + }, + { + "completion_length": 1839.1666870117188, + "epoch": 0.7411585365853659, + "grad_norm": 0.21972195901765187, + "kl": 0.063720703125, + "learning_rate": 5.717352126720096e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4862 + }, + { + "completion_length": 1948.3333740234375, + "epoch": 0.7413109756097561, + "grad_norm": 0.11153863634115961, + "kl": 0.05712890625, + "learning_rate": 5.711083708628253e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4863 + }, + { + "completion_length": 1300.8333740234375, + "epoch": 0.7414634146341463, + "grad_norm": 1.4211811477349316, + "kl": 0.066162109375, + "learning_rate": 5.704817920635348e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4864 + }, + { + "completion_length": 1992.8333740234375, + "epoch": 0.7416158536585366, + "grad_norm": 0.12945013472523048, + "kl": 0.067626953125, + "learning_rate": 5.698554764515504e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4865 + }, + { + "completion_length": 871.1666870117188, + "epoch": 0.7417682926829269, + "grad_norm": 0.18312208561916146, + "kl": 0.0555419921875, + "learning_rate": 5.692294242042096e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4866 + }, + { + "completion_length": 741.5000305175781, + "epoch": 0.7419207317073171, + "grad_norm": 0.08619141761899834, + "kl": 0.0587158203125, + "learning_rate": 5.686036354987752e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4867 + }, + { + "completion_length": 960.3333740234375, + "epoch": 0.7420731707317073, + "grad_norm": 0.11152640116404423, + "kl": 0.0611572265625, + "learning_rate": 5.679781105124357e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4868 + }, + { + "completion_length": 1840.0001220703125, + "epoch": 0.7422256097560975, + "grad_norm": 0.0952630218304088, + "kl": 0.070556640625, + "learning_rate": 5.67352849422304e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4869 + }, + { + "completion_length": 1412.3333740234375, + "epoch": 0.7423780487804879, + "grad_norm": 0.07249025205142061, + "kl": 0.0491943359375, + "learning_rate": 5.667278524054187e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4870 + }, + { + "completion_length": 702.1666870117188, + "epoch": 0.7425304878048781, + "grad_norm": 0.15687533284056446, + "kl": 0.0501708984375, + "learning_rate": 5.661031196387446e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4871 + }, + { + "completion_length": 939.1666870117188, + "epoch": 0.7426829268292683, + "grad_norm": 0.10127801257562613, + "kl": 0.057861328125, + "learning_rate": 5.654786512991705e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4872 + }, + { + "completion_length": 1392.3333740234375, + "epoch": 0.7428353658536585, + "grad_norm": 0.07467046587615937, + "kl": 0.04595947265625, + "learning_rate": 5.648544475635107e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4873 + }, + { + "completion_length": 1488.3333740234375, + "epoch": 0.7429878048780488, + "grad_norm": 0.1139168100018424, + "kl": 0.06982421875, + "learning_rate": 5.642305086085048e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4874 + }, + { + "completion_length": 1697.3334350585938, + "epoch": 0.743140243902439, + "grad_norm": 0.14492653701173722, + "kl": 0.067138671875, + "learning_rate": 5.636068346108172e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4875 + }, + { + "completion_length": 1853.6666870117188, + "epoch": 0.7432926829268293, + "grad_norm": 0.1157005267661009, + "kl": 0.063232421875, + "learning_rate": 5.629834257470377e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4876 + }, + { + "completion_length": 1297.6667175292969, + "epoch": 0.7434451219512195, + "grad_norm": 0.10538083702555177, + "kl": 0.07861328125, + "learning_rate": 5.623602821936797e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4877 + }, + { + "completion_length": 1564.8333740234375, + "epoch": 0.7435975609756098, + "grad_norm": 0.13675404109990233, + "kl": 0.072998046875, + "learning_rate": 5.617374041271828e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4878 + }, + { + "completion_length": 709.6666870117188, + "epoch": 0.74375, + "grad_norm": 0.19586435076221825, + "kl": 0.0703125, + "learning_rate": 5.611147917239114e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4879 + }, + { + "completion_length": 1239.8333740234375, + "epoch": 0.7439024390243902, + "grad_norm": 0.10369430748163995, + "kl": 0.0538330078125, + "learning_rate": 5.60492445160154e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4880 + }, + { + "completion_length": 1438.3333740234375, + "epoch": 0.7440548780487805, + "grad_norm": 0.35924402161845914, + "kl": 0.0777587890625, + "learning_rate": 5.598703646121248e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4881 + }, + { + "completion_length": 1840.1666870117188, + "epoch": 0.7442073170731708, + "grad_norm": 0.08943751292190996, + "kl": 0.05322265625, + "learning_rate": 5.592485502559611e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4882 + }, + { + "completion_length": 675.5, + "epoch": 0.744359756097561, + "grad_norm": 0.24650893951695618, + "kl": 0.07763671875, + "learning_rate": 5.586270022677261e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4883 + }, + { + "completion_length": 651.8333435058594, + "epoch": 0.7445121951219512, + "grad_norm": 0.16424282543781304, + "kl": 0.0615234375, + "learning_rate": 5.580057208234074e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4884 + }, + { + "completion_length": 1104.6666870117188, + "epoch": 0.7446646341463414, + "grad_norm": 2.0062833114522047, + "kl": 0.0858154296875, + "learning_rate": 5.573847060989169e-07, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4885 + }, + { + "completion_length": 1683.0, + "epoch": 0.7448170731707318, + "grad_norm": 0.9743004018957834, + "kl": 0.0594482421875, + "learning_rate": 5.567639582700916e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4886 + }, + { + "completion_length": 1233.3333740234375, + "epoch": 0.744969512195122, + "grad_norm": 0.10107736296065818, + "kl": 0.0550537109375, + "learning_rate": 5.561434775126915e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4887 + }, + { + "completion_length": 3073.666748046875, + "epoch": 0.7451219512195122, + "grad_norm": 0.15768104667231148, + "kl": 0.0595703125, + "learning_rate": 5.555232640024021e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4888 + }, + { + "completion_length": 1535.0000610351562, + "epoch": 0.7452743902439024, + "grad_norm": 0.08315326253896163, + "kl": 0.0582275390625, + "learning_rate": 5.549033179148332e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4889 + }, + { + "completion_length": 2074.3333740234375, + "epoch": 0.7454268292682927, + "grad_norm": 0.07768596184329297, + "kl": 0.0595703125, + "learning_rate": 5.542836394255193e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4890 + }, + { + "completion_length": 812.8333435058594, + "epoch": 0.745579268292683, + "grad_norm": 0.12191715677336036, + "kl": 0.072998046875, + "learning_rate": 5.536642287099167e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4891 + }, + { + "completion_length": 1516.0000610351562, + "epoch": 0.7457317073170732, + "grad_norm": 0.08097789437231104, + "kl": 0.0621337890625, + "learning_rate": 5.530450859434092e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4892 + }, + { + "completion_length": 984.5, + "epoch": 0.7458841463414634, + "grad_norm": 0.11017232942139907, + "kl": 0.06884765625, + "learning_rate": 5.524262113013031e-07, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4893 + }, + { + "completion_length": 1384.8333740234375, + "epoch": 0.7460365853658537, + "grad_norm": 0.07903428833777702, + "kl": 0.056884765625, + "learning_rate": 5.518076049588292e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4894 + }, + { + "completion_length": 1936.1666870117188, + "epoch": 0.7461890243902439, + "grad_norm": 0.06990067322910604, + "kl": 0.064453125, + "learning_rate": 5.511892670911409e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4895 + }, + { + "completion_length": 655.5000305175781, + "epoch": 0.7463414634146341, + "grad_norm": 0.5151449165598998, + "kl": 0.094482421875, + "learning_rate": 5.505711978733175e-07, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4896 + }, + { + "completion_length": 768.1666870117188, + "epoch": 0.7464939024390244, + "grad_norm": 0.11418290273438293, + "kl": 0.0789794921875, + "learning_rate": 5.499533974803612e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4897 + }, + { + "completion_length": 1034.5000610351562, + "epoch": 0.7466463414634147, + "grad_norm": 0.17701649739355796, + "kl": 0.066162109375, + "learning_rate": 5.493358660871986e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4898 + }, + { + "completion_length": 1024.6667175292969, + "epoch": 0.7467987804878049, + "grad_norm": 0.0939177018087404, + "kl": 0.074951171875, + "learning_rate": 5.487186038686803e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4899 + }, + { + "completion_length": 2019.1666870117188, + "epoch": 0.7469512195121951, + "grad_norm": 0.09337740551314141, + "kl": 0.074951171875, + "learning_rate": 5.48101610999579e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4900 + }, + { + "completion_length": 867.5, + "epoch": 0.7471036585365853, + "grad_norm": 1.893823329233161, + "kl": 0.09228515625, + "learning_rate": 5.474848876545931e-07, + "loss": 0.0037, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4901 + }, + { + "completion_length": 2463.0001220703125, + "epoch": 0.7472560975609757, + "grad_norm": 0.14635865115174793, + "kl": 0.0543212890625, + "learning_rate": 5.468684340083442e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4902 + }, + { + "completion_length": 2738.8333740234375, + "epoch": 0.7474085365853659, + "grad_norm": 0.10100987997991553, + "kl": 0.064697265625, + "learning_rate": 5.462522502353769e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4903 + }, + { + "completion_length": 1208.5000305175781, + "epoch": 0.7475609756097561, + "grad_norm": 0.11613353374599633, + "kl": 0.0751953125, + "learning_rate": 5.456363365101606e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4904 + }, + { + "completion_length": 1636.0000610351562, + "epoch": 0.7477134146341463, + "grad_norm": 0.11354647128377332, + "kl": 0.070068359375, + "learning_rate": 5.450206930070862e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4905 + }, + { + "completion_length": 1237.6666870117188, + "epoch": 0.7478658536585366, + "grad_norm": 0.10375414555494956, + "kl": 0.0648193359375, + "learning_rate": 5.444053199004703e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4906 + }, + { + "completion_length": 1828.8333740234375, + "epoch": 0.7480182926829269, + "grad_norm": 0.2704830602042267, + "kl": 0.0682373046875, + "learning_rate": 5.437902173645515e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4907 + }, + { + "completion_length": 1492.5000305175781, + "epoch": 0.7481707317073171, + "grad_norm": 0.10638034630636452, + "kl": 0.0557861328125, + "learning_rate": 5.43175385573493e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4908 + }, + { + "completion_length": 2195.5, + "epoch": 0.7483231707317073, + "grad_norm": 0.07963048371264739, + "kl": 0.0650634765625, + "learning_rate": 5.425608247013789e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4909 + }, + { + "completion_length": 1867.8333740234375, + "epoch": 0.7484756097560976, + "grad_norm": 0.08871187204210121, + "kl": 0.0689697265625, + "learning_rate": 5.419465349222199e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4910 + }, + { + "completion_length": 1250.5000610351562, + "epoch": 0.7486280487804878, + "grad_norm": 0.09525141854821904, + "kl": 0.07470703125, + "learning_rate": 5.41332516409948e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4911 + }, + { + "completion_length": 1176.666748046875, + "epoch": 0.748780487804878, + "grad_norm": 0.11809860509093863, + "kl": 0.070556640625, + "learning_rate": 5.407187693384191e-07, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4912 + }, + { + "completion_length": 948.6666870117188, + "epoch": 0.7489329268292683, + "grad_norm": 0.10559159113316222, + "kl": 0.0499267578125, + "learning_rate": 5.401052938814107e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4913 + }, + { + "completion_length": 1147.0000610351562, + "epoch": 0.7490853658536586, + "grad_norm": 0.14192729811730406, + "kl": 0.071044921875, + "learning_rate": 5.394920902126253e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4914 + }, + { + "completion_length": 1673.8333740234375, + "epoch": 0.7492378048780488, + "grad_norm": 0.08483192576491769, + "kl": 0.05859375, + "learning_rate": 5.388791585056876e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4915 + }, + { + "completion_length": 1809.8333740234375, + "epoch": 0.749390243902439, + "grad_norm": 0.10333617491046602, + "kl": 0.07275390625, + "learning_rate": 5.382664989341455e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4916 + }, + { + "completion_length": 3106.666748046875, + "epoch": 0.7495426829268292, + "grad_norm": 0.05498274591908098, + "kl": 0.0491943359375, + "learning_rate": 5.376541116714703e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4917 + }, + { + "completion_length": 2623.8333740234375, + "epoch": 0.7496951219512196, + "grad_norm": 0.07971703163067652, + "kl": 0.0579833984375, + "learning_rate": 5.370419968910543e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4918 + }, + { + "completion_length": 2248.0, + "epoch": 0.7498475609756098, + "grad_norm": 0.07629037550254948, + "kl": 0.0498046875, + "learning_rate": 5.364301547662148e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4919 + }, + { + "completion_length": 1150.5000305175781, + "epoch": 0.75, + "grad_norm": 0.08728846813207215, + "kl": 0.0577392578125, + "learning_rate": 5.358185854701909e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4920 + }, + { + "completion_length": 2552.6666870117188, + "epoch": 0.7501524390243902, + "grad_norm": 0.06614567278262064, + "kl": 0.0521240234375, + "learning_rate": 5.352072891761455e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4921 + }, + { + "completion_length": 1818.0000610351562, + "epoch": 0.7503048780487804, + "grad_norm": 0.08281184158604236, + "kl": 0.060791015625, + "learning_rate": 5.345962660571622e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4922 + }, + { + "completion_length": 1642.0001220703125, + "epoch": 0.7504573170731708, + "grad_norm": 0.8850531868474666, + "kl": 0.090087890625, + "learning_rate": 5.339855162862485e-07, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4923 + }, + { + "completion_length": 1797.8333740234375, + "epoch": 0.750609756097561, + "grad_norm": 0.47822954371598186, + "kl": 0.076416015625, + "learning_rate": 5.33375040036335e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4924 + }, + { + "completion_length": 2732.166748046875, + "epoch": 0.7507621951219512, + "grad_norm": 0.06936872303022648, + "kl": 0.052734375, + "learning_rate": 5.327648374802739e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4925 + }, + { + "completion_length": 3139.8333740234375, + "epoch": 0.7509146341463414, + "grad_norm": 0.06008329370925716, + "kl": 0.0506591796875, + "learning_rate": 5.321549087908409e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4926 + }, + { + "completion_length": 1191.5000610351562, + "epoch": 0.7510670731707317, + "grad_norm": 2.1546489488879383, + "kl": 0.06982421875, + "learning_rate": 5.315452541407321e-07, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4927 + }, + { + "completion_length": 2058.6666870117188, + "epoch": 0.751219512195122, + "grad_norm": 0.0739733347175263, + "kl": 0.0489501953125, + "learning_rate": 5.309358737025682e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4928 + }, + { + "completion_length": 1079.8333740234375, + "epoch": 0.7513719512195122, + "grad_norm": 0.16645801769919466, + "kl": 0.0589599609375, + "learning_rate": 5.30326767648892e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4929 + }, + { + "completion_length": 2487.1666870117188, + "epoch": 0.7515243902439024, + "grad_norm": 0.09156404447298676, + "kl": 0.058837890625, + "learning_rate": 5.297179361521681e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4930 + }, + { + "completion_length": 1561.6666870117188, + "epoch": 0.7516768292682927, + "grad_norm": 0.1254581776177462, + "kl": 0.072021484375, + "learning_rate": 5.291093793847826e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4931 + }, + { + "completion_length": 1685.3333740234375, + "epoch": 0.7518292682926829, + "grad_norm": 0.2667334741373227, + "kl": 0.08740234375, + "learning_rate": 5.285010975190447e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4932 + }, + { + "completion_length": 1704.5000915527344, + "epoch": 0.7519817073170731, + "grad_norm": 0.14772493062675002, + "kl": 0.0701904296875, + "learning_rate": 5.278930907271859e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4933 + }, + { + "completion_length": 891.0000610351562, + "epoch": 0.7521341463414634, + "grad_norm": 1.0851996763373637, + "kl": 0.0477294921875, + "learning_rate": 5.272853591813594e-07, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4934 + }, + { + "completion_length": 943.3333740234375, + "epoch": 0.7522865853658537, + "grad_norm": 0.09391700791028955, + "kl": 0.0428466796875, + "learning_rate": 5.266779030536414e-07, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4935 + }, + { + "completion_length": 1524.3333740234375, + "epoch": 0.7524390243902439, + "grad_norm": 0.10767769434189801, + "kl": 0.0609130859375, + "learning_rate": 5.26070722516028e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4936 + }, + { + "completion_length": 925.0000610351562, + "epoch": 0.7525914634146341, + "grad_norm": 0.14681220222122462, + "kl": 0.077392578125, + "learning_rate": 5.254638177404391e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4937 + }, + { + "completion_length": 1500.8334350585938, + "epoch": 0.7527439024390243, + "grad_norm": 1.5971436966914265, + "kl": 0.0460205078125, + "learning_rate": 5.248571888987163e-07, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4938 + }, + { + "completion_length": 2272.0001220703125, + "epoch": 0.7528963414634147, + "grad_norm": 0.08474922186531067, + "kl": 0.057861328125, + "learning_rate": 5.242508361626231e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4939 + }, + { + "completion_length": 1144.0, + "epoch": 0.7530487804878049, + "grad_norm": 0.10368819328643987, + "kl": 0.044189453125, + "learning_rate": 5.236447597038434e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4940 + }, + { + "completion_length": 858.1666870117188, + "epoch": 0.7532012195121951, + "grad_norm": 4.6058977353727, + "kl": 0.18896484375, + "learning_rate": 5.230389596939848e-07, + "loss": 0.0076, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4941 + }, + { + "completion_length": 2393.0000610351562, + "epoch": 0.7533536585365853, + "grad_norm": 0.05706782657208845, + "kl": 0.05078125, + "learning_rate": 5.224334363045755e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4942 + }, + { + "completion_length": 1500.666748046875, + "epoch": 0.7535060975609756, + "grad_norm": 0.29109687324967354, + "kl": 0.0950927734375, + "learning_rate": 5.218281897070658e-07, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4943 + }, + { + "completion_length": 2017.8334350585938, + "epoch": 0.7536585365853659, + "grad_norm": 0.10955447336213361, + "kl": 0.0587158203125, + "learning_rate": 5.21223220072828e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4944 + }, + { + "completion_length": 2038.1666870117188, + "epoch": 0.7538109756097561, + "grad_norm": 0.20748726940311485, + "kl": 0.0694580078125, + "learning_rate": 5.206185275731546e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4945 + }, + { + "completion_length": 1778.0000915527344, + "epoch": 0.7539634146341463, + "grad_norm": 0.07744960865141028, + "kl": 0.0728759765625, + "learning_rate": 5.200141123792605e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4946 + }, + { + "completion_length": 893.6666870117188, + "epoch": 0.7541158536585366, + "grad_norm": 0.07640754355504828, + "kl": 0.0472412109375, + "learning_rate": 5.194099746622829e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4947 + }, + { + "completion_length": 2397.5001220703125, + "epoch": 0.7542682926829268, + "grad_norm": 0.057053018999319444, + "kl": 0.0523681640625, + "learning_rate": 5.188061145932798e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4948 + }, + { + "completion_length": 1863.166748046875, + "epoch": 0.754420731707317, + "grad_norm": 0.09434487003176854, + "kl": 0.066650390625, + "learning_rate": 5.182025323432297e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4949 + }, + { + "completion_length": 1424.8333740234375, + "epoch": 0.7545731707317073, + "grad_norm": 1.5706856083315708, + "kl": 0.0615234375, + "learning_rate": 5.175992280830331e-07, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4950 + }, + { + "completion_length": 3038.666748046875, + "epoch": 0.7547256097560976, + "grad_norm": 0.041400439139051726, + "kl": 0.04248046875, + "learning_rate": 5.169962019835123e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4951 + }, + { + "completion_length": 1152.1666870117188, + "epoch": 0.7548780487804878, + "grad_norm": 0.08685115826938232, + "kl": 0.04632568359375, + "learning_rate": 5.163934542154106e-07, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4952 + }, + { + "completion_length": 1838.666748046875, + "epoch": 0.755030487804878, + "grad_norm": 0.059578760895485014, + "kl": 0.0543212890625, + "learning_rate": 5.157909849493913e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4953 + }, + { + "completion_length": 894.1666870117188, + "epoch": 0.7551829268292682, + "grad_norm": 0.11946389066306237, + "kl": 0.0799560546875, + "learning_rate": 5.151887943560406e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4954 + }, + { + "completion_length": 1653.6667175292969, + "epoch": 0.7553353658536586, + "grad_norm": 0.12251074015308233, + "kl": 0.06787109375, + "learning_rate": 5.145868826058648e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4955 + }, + { + "completion_length": 2199.166748046875, + "epoch": 0.7554878048780488, + "grad_norm": 0.1309793908347183, + "kl": 0.0623779296875, + "learning_rate": 5.139852498692916e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4956 + }, + { + "completion_length": 1489.3333740234375, + "epoch": 0.755640243902439, + "grad_norm": 1.7084768714614307, + "kl": 0.0643310546875, + "learning_rate": 5.133838963166701e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4957 + }, + { + "completion_length": 746.5000305175781, + "epoch": 0.7557926829268292, + "grad_norm": 0.10527312671023531, + "kl": 0.057373046875, + "learning_rate": 5.127828221182688e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4958 + }, + { + "completion_length": 1319.3333435058594, + "epoch": 0.7559451219512195, + "grad_norm": 0.10676099199062758, + "kl": 0.05712890625, + "learning_rate": 5.121820274442784e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4959 + }, + { + "completion_length": 1390.0, + "epoch": 0.7560975609756098, + "grad_norm": 0.09503233031586443, + "kl": 0.0692138671875, + "learning_rate": 5.115815124648103e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4960 + }, + { + "completion_length": 1070.6666870117188, + "epoch": 0.75625, + "grad_norm": 0.10607231925647621, + "kl": 0.070556640625, + "learning_rate": 5.109812773498967e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4961 + }, + { + "completion_length": 1177.1666870117188, + "epoch": 0.7564024390243902, + "grad_norm": 0.20038179292873984, + "kl": 0.0625, + "learning_rate": 5.103813222694909e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4962 + }, + { + "completion_length": 1021.1666870117188, + "epoch": 0.7565548780487805, + "grad_norm": 0.1739604073724202, + "kl": 0.042724609375, + "learning_rate": 5.097816473934655e-07, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4963 + }, + { + "completion_length": 1109.5, + "epoch": 0.7567073170731707, + "grad_norm": 2.374855260294737, + "kl": 0.080078125, + "learning_rate": 5.091822528916151e-07, + "loss": 0.0032, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4964 + }, + { + "completion_length": 1101.5000610351562, + "epoch": 0.756859756097561, + "grad_norm": 0.25486126785274066, + "kl": 0.0650634765625, + "learning_rate": 5.085831389336538e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4965 + }, + { + "completion_length": 1438.666748046875, + "epoch": 0.7570121951219512, + "grad_norm": 0.09336401476183295, + "kl": 0.050537109375, + "learning_rate": 5.07984305689219e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4966 + }, + { + "completion_length": 924.8333435058594, + "epoch": 0.7571646341463415, + "grad_norm": 2.0857950882351464, + "kl": 0.08251953125, + "learning_rate": 5.07385753327865e-07, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4967 + }, + { + "completion_length": 1559.8333740234375, + "epoch": 0.7573170731707317, + "grad_norm": 0.09842325707456238, + "kl": 0.0657958984375, + "learning_rate": 5.067874820190684e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4968 + }, + { + "completion_length": 1004.6666870117188, + "epoch": 0.7574695121951219, + "grad_norm": 0.13834733917306233, + "kl": 0.0645751953125, + "learning_rate": 5.061894919322263e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4969 + }, + { + "completion_length": 1128.6666870117188, + "epoch": 0.7576219512195121, + "grad_norm": 2.1797201950431164, + "kl": 0.08837890625, + "learning_rate": 5.055917832366561e-07, + "loss": 0.0035, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4970 + }, + { + "completion_length": 1305.166748046875, + "epoch": 0.7577743902439025, + "grad_norm": 0.0651126947566105, + "kl": 0.033203125, + "learning_rate": 5.049943561015945e-07, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4971 + }, + { + "completion_length": 982.6666870117188, + "epoch": 0.7579268292682927, + "grad_norm": 0.16490884953245238, + "kl": 0.06005859375, + "learning_rate": 5.043972106961996e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4972 + }, + { + "completion_length": 1512.3333740234375, + "epoch": 0.7580792682926829, + "grad_norm": 0.10741267967492614, + "kl": 0.058837890625, + "learning_rate": 5.038003471895497e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4973 + }, + { + "completion_length": 1541.0, + "epoch": 0.7582317073170731, + "grad_norm": 0.152513032795089, + "kl": 0.064697265625, + "learning_rate": 5.032037657506428e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4974 + }, + { + "completion_length": 1459.1667175292969, + "epoch": 0.7583841463414634, + "grad_norm": 0.09086157981210753, + "kl": 0.060302734375, + "learning_rate": 5.02607466548398e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4975 + }, + { + "completion_length": 1008.1667175292969, + "epoch": 0.7585365853658537, + "grad_norm": 0.10302127839724551, + "kl": 0.0498046875, + "learning_rate": 5.020114497516521e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4976 + }, + { + "completion_length": 1253.6666870117188, + "epoch": 0.7586890243902439, + "grad_norm": 2.002517361412714, + "kl": 0.0621337890625, + "learning_rate": 5.014157155291648e-07, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4977 + }, + { + "completion_length": 1002.3333740234375, + "epoch": 0.7588414634146341, + "grad_norm": 0.1135397905067717, + "kl": 0.07470703125, + "learning_rate": 5.00820264049614e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4978 + }, + { + "completion_length": 1989.5000610351562, + "epoch": 0.7589939024390244, + "grad_norm": 0.06351634872642375, + "kl": 0.052734375, + "learning_rate": 5.002250954815981e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4979 + }, + { + "completion_length": 1355.3333740234375, + "epoch": 0.7591463414634146, + "grad_norm": 1.2290721361288326, + "kl": 0.0498046875, + "learning_rate": 4.996302099936363e-07, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4980 + }, + { + "completion_length": 1818.1667175292969, + "epoch": 0.7592987804878049, + "grad_norm": 0.3785802510984307, + "kl": 0.076171875, + "learning_rate": 4.990356077541654e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4981 + }, + { + "completion_length": 3018.166748046875, + "epoch": 0.7594512195121951, + "grad_norm": 1.262364177589845, + "kl": 0.0528564453125, + "learning_rate": 4.984412889315439e-07, + "loss": 0.0021, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 4982 + }, + { + "completion_length": 2002.5000610351562, + "epoch": 0.7596036585365854, + "grad_norm": 0.08285495265517868, + "kl": 0.0455322265625, + "learning_rate": 4.978472536940495e-07, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4983 + }, + { + "completion_length": 2454.3334350585938, + "epoch": 0.7597560975609756, + "grad_norm": 0.06736627616824022, + "kl": 0.0528564453125, + "learning_rate": 4.972535022098795e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4984 + }, + { + "completion_length": 2698.6666870117188, + "epoch": 0.7599085365853658, + "grad_norm": 0.061629815955892314, + "kl": 0.0528564453125, + "learning_rate": 4.966600346471512e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4985 + }, + { + "completion_length": 1275.3334197998047, + "epoch": 0.760060975609756, + "grad_norm": 0.15762848068454607, + "kl": 0.0491943359375, + "learning_rate": 4.96066851173901e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4986 + }, + { + "completion_length": 2760.0, + "epoch": 0.7602134146341464, + "grad_norm": 0.06482034147769145, + "kl": 0.0460205078125, + "learning_rate": 4.954739519580851e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4987 + }, + { + "completion_length": 2329.6666870117188, + "epoch": 0.7603658536585366, + "grad_norm": 1.0597946691868005, + "kl": 0.0625, + "learning_rate": 4.948813371675798e-07, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4988 + }, + { + "completion_length": 814.6666870117188, + "epoch": 0.7605182926829268, + "grad_norm": 0.07817684544768272, + "kl": 0.042724609375, + "learning_rate": 4.942890069701795e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4989 + }, + { + "completion_length": 2724.0001220703125, + "epoch": 0.760670731707317, + "grad_norm": 0.07129387784477291, + "kl": 0.0533447265625, + "learning_rate": 4.936969615335991e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4990 + }, + { + "completion_length": 970.0000305175781, + "epoch": 0.7608231707317074, + "grad_norm": 0.14900789722155003, + "kl": 0.0867919921875, + "learning_rate": 4.931052010254725e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4991 + }, + { + "completion_length": 1575.8333435058594, + "epoch": 0.7609756097560976, + "grad_norm": 0.2499408730286924, + "kl": 0.056884765625, + "learning_rate": 4.925137256133533e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 4992 + }, + { + "completion_length": 2480.0001220703125, + "epoch": 0.7611280487804878, + "grad_norm": 0.09842350144374658, + "kl": 0.0684814453125, + "learning_rate": 4.919225354647143e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4993 + }, + { + "completion_length": 2420.3334350585938, + "epoch": 0.761280487804878, + "grad_norm": 0.19570567776149306, + "kl": 0.0537109375, + "learning_rate": 4.913316307469464e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4994 + }, + { + "completion_length": 2534.8333740234375, + "epoch": 0.7614329268292683, + "grad_norm": 0.0868338335845961, + "kl": 0.0582275390625, + "learning_rate": 4.907410116273612e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4995 + }, + { + "completion_length": 1895.8334350585938, + "epoch": 0.7615853658536585, + "grad_norm": 1.0278996697791818, + "kl": 0.048095703125, + "learning_rate": 4.901506782731888e-07, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 4996 + }, + { + "completion_length": 2138.8333740234375, + "epoch": 0.7617378048780488, + "grad_norm": 0.09522851415262627, + "kl": 0.0574951171875, + "learning_rate": 4.895606308515784e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4997 + }, + { + "completion_length": 1625.1666870117188, + "epoch": 0.761890243902439, + "grad_norm": 0.11589058140039697, + "kl": 0.069091796875, + "learning_rate": 4.889708695295988e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4998 + }, + { + "completion_length": 1013.6667175292969, + "epoch": 0.7620426829268293, + "grad_norm": 0.09505788352800151, + "kl": 0.051025390625, + "learning_rate": 4.883813944742362e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 4999 + }, + { + "completion_length": 1840.5, + "epoch": 0.7621951219512195, + "grad_norm": 1.7263419550260553, + "kl": 0.067138671875, + "learning_rate": 4.877922058523971e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5000 + }, + { + "completion_length": 2466.3333740234375, + "epoch": 0.7623475609756097, + "grad_norm": 0.05322201115862611, + "kl": 0.0428466796875, + "learning_rate": 4.87203303830907e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5001 + }, + { + "completion_length": 1408.3333435058594, + "epoch": 0.7625, + "grad_norm": 0.08114740867379565, + "kl": 0.0543212890625, + "learning_rate": 4.866146885765096e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5002 + }, + { + "completion_length": 1762.1666870117188, + "epoch": 0.7626524390243903, + "grad_norm": 2.132949464235455, + "kl": 0.072021484375, + "learning_rate": 4.860263602558679e-07, + "loss": 0.0029, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5003 + }, + { + "completion_length": 2262.0001220703125, + "epoch": 0.7628048780487805, + "grad_norm": 0.06709169579900656, + "kl": 0.0633544921875, + "learning_rate": 4.854383190355629e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5004 + }, + { + "completion_length": 1377.6666870117188, + "epoch": 0.7629573170731707, + "grad_norm": 0.09793770414330084, + "kl": 0.06982421875, + "learning_rate": 4.848505650820955e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5005 + }, + { + "completion_length": 1677.8333740234375, + "epoch": 0.7631097560975609, + "grad_norm": 0.08877901109160669, + "kl": 0.047119140625, + "learning_rate": 4.842630985618844e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5006 + }, + { + "completion_length": 2197.8334350585938, + "epoch": 0.7632621951219513, + "grad_norm": 0.09843778352962385, + "kl": 0.05126953125, + "learning_rate": 4.836759196412666e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5007 + }, + { + "completion_length": 1308.6666870117188, + "epoch": 0.7634146341463415, + "grad_norm": 0.06816058960775215, + "kl": 0.045166015625, + "learning_rate": 4.830890284864985e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5008 + }, + { + "completion_length": 1005.0000610351562, + "epoch": 0.7635670731707317, + "grad_norm": 0.13208825537500796, + "kl": 0.0521240234375, + "learning_rate": 4.825024252637547e-07, + "loss": 0.0021, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 5009 + }, + { + "completion_length": 1839.666748046875, + "epoch": 0.7637195121951219, + "grad_norm": 0.08744743545926177, + "kl": 0.0521240234375, + "learning_rate": 4.819161101391281e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5010 + }, + { + "completion_length": 1248.6666870117188, + "epoch": 0.7638719512195122, + "grad_norm": 0.11284990963926951, + "kl": 0.057861328125, + "learning_rate": 4.813300832786308e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5011 + }, + { + "completion_length": 2377.8334350585938, + "epoch": 0.7640243902439025, + "grad_norm": 0.06764211890830302, + "kl": 0.0570068359375, + "learning_rate": 4.807443448481917e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5012 + }, + { + "completion_length": 1965.166748046875, + "epoch": 0.7641768292682927, + "grad_norm": 0.3110352868741807, + "kl": 0.0906982421875, + "learning_rate": 4.801588950136595e-07, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5013 + }, + { + "completion_length": 2883.166748046875, + "epoch": 0.7643292682926829, + "grad_norm": 0.2960759084591826, + "kl": 0.0706787109375, + "learning_rate": 4.795737339408007e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5014 + }, + { + "completion_length": 805.5, + "epoch": 0.7644817073170732, + "grad_norm": 0.09542582810062285, + "kl": 0.0560302734375, + "learning_rate": 4.789888617953003e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5015 + }, + { + "completion_length": 2281.0001220703125, + "epoch": 0.7646341463414634, + "grad_norm": 0.12097859678416849, + "kl": 0.0498046875, + "learning_rate": 4.784042787427605e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5016 + }, + { + "completion_length": 1900.666748046875, + "epoch": 0.7647865853658536, + "grad_norm": 0.8736795407735787, + "kl": 0.0643310546875, + "learning_rate": 4.778199849487028e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5017 + }, + { + "completion_length": 1763.3334350585938, + "epoch": 0.7649390243902439, + "grad_norm": 2.4316189023439367, + "kl": 0.0660400390625, + "learning_rate": 4.772359805785663e-07, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5018 + }, + { + "completion_length": 1414.3333740234375, + "epoch": 0.7650914634146342, + "grad_norm": 0.08345636715311529, + "kl": 0.0439453125, + "learning_rate": 4.7665226579770824e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5019 + }, + { + "completion_length": 2676.666748046875, + "epoch": 0.7652439024390244, + "grad_norm": 0.07475222313610363, + "kl": 0.0648193359375, + "learning_rate": 4.7606884077140373e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5020 + }, + { + "completion_length": 931.5000305175781, + "epoch": 0.7653963414634146, + "grad_norm": 0.09780309940570169, + "kl": 0.07470703125, + "learning_rate": 4.754857056648461e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5021 + }, + { + "completion_length": 3543.5, + "epoch": 0.7655487804878048, + "grad_norm": 0.0791670236612808, + "kl": 0.055908203125, + "learning_rate": 4.7490286064314634e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5022 + }, + { + "completion_length": 1333.1666870117188, + "epoch": 0.7657012195121952, + "grad_norm": 0.105775845733976, + "kl": 0.0587158203125, + "learning_rate": 4.7432030587133345e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5023 + }, + { + "completion_length": 1108.3333740234375, + "epoch": 0.7658536585365854, + "grad_norm": 0.0757486543418006, + "kl": 0.0472412109375, + "learning_rate": 4.7373804151435456e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5024 + }, + { + "completion_length": 1876.3333740234375, + "epoch": 0.7660060975609756, + "grad_norm": 0.05161425002927586, + "kl": 0.0377197265625, + "learning_rate": 4.731560677370733e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5025 + }, + { + "completion_length": 3223.0, + "epoch": 0.7661585365853658, + "grad_norm": 0.05414315935655099, + "kl": 0.04296875, + "learning_rate": 4.7257438470427256e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5026 + }, + { + "completion_length": 2192.0, + "epoch": 0.7663109756097561, + "grad_norm": 0.09050198953722405, + "kl": 0.0626220703125, + "learning_rate": 4.71992992580652e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5027 + }, + { + "completion_length": 1673.6666870117188, + "epoch": 0.7664634146341464, + "grad_norm": 0.1962493318002411, + "kl": 0.0535888671875, + "learning_rate": 4.714118915308296e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5028 + }, + { + "completion_length": 1554.8333740234375, + "epoch": 0.7666158536585366, + "grad_norm": 0.14487979478264293, + "kl": 0.069091796875, + "learning_rate": 4.708310817193404e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5029 + }, + { + "completion_length": 534.1666870117188, + "epoch": 0.7667682926829268, + "grad_norm": 0.13125662421094386, + "kl": 0.059814453125, + "learning_rate": 4.702505633106366e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5030 + }, + { + "completion_length": 2036.5001220703125, + "epoch": 0.7669207317073171, + "grad_norm": 0.07771050631035804, + "kl": 0.05078125, + "learning_rate": 4.696703364690888e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5031 + }, + { + "completion_length": 1801.3334350585938, + "epoch": 0.7670731707317073, + "grad_norm": 0.1051527199097265, + "kl": 0.043701171875, + "learning_rate": 4.6909040135898463e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5032 + }, + { + "completion_length": 1658.1667175292969, + "epoch": 0.7672256097560975, + "grad_norm": 0.08303956773254428, + "kl": 0.05810546875, + "learning_rate": 4.6851075814452944e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5033 + }, + { + "completion_length": 1572.8333740234375, + "epoch": 0.7673780487804878, + "grad_norm": 0.07997049068454505, + "kl": 0.04345703125, + "learning_rate": 4.679314069898446e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5034 + }, + { + "completion_length": 939.1666870117188, + "epoch": 0.7675304878048781, + "grad_norm": 0.2266710002779784, + "kl": 0.046630859375, + "learning_rate": 4.6735234805897044e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5035 + }, + { + "completion_length": 997.5000610351562, + "epoch": 0.7676829268292683, + "grad_norm": 0.08262650626538803, + "kl": 0.05712890625, + "learning_rate": 4.6677358151586393e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5036 + }, + { + "completion_length": 1506.666748046875, + "epoch": 0.7678353658536585, + "grad_norm": 0.10096977559200844, + "kl": 0.0577392578125, + "learning_rate": 4.6619510752439896e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5037 + }, + { + "completion_length": 3191.166748046875, + "epoch": 0.7679878048780487, + "grad_norm": 0.05948027400628853, + "kl": 0.04931640625, + "learning_rate": 4.656169262483673e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5038 + }, + { + "completion_length": 1864.166748046875, + "epoch": 0.7681402439024391, + "grad_norm": 0.21373004035625465, + "kl": 0.065673828125, + "learning_rate": 4.6503903785147684e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5039 + }, + { + "completion_length": 858.0000305175781, + "epoch": 0.7682926829268293, + "grad_norm": 1.490245669823618, + "kl": 0.0599365234375, + "learning_rate": 4.6446144249735345e-07, + "loss": 0.0024, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5040 + }, + { + "completion_length": 2587.166748046875, + "epoch": 0.7684451219512195, + "grad_norm": 0.06773834499601604, + "kl": 0.0540771484375, + "learning_rate": 4.638841403495396e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5041 + }, + { + "completion_length": 1703.5000915527344, + "epoch": 0.7685975609756097, + "grad_norm": 0.10500294668035512, + "kl": 0.0472412109375, + "learning_rate": 4.633071315714953e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5042 + }, + { + "completion_length": 2094.8334350585938, + "epoch": 0.76875, + "grad_norm": 0.0764050083707077, + "kl": 0.048583984375, + "learning_rate": 4.627304163265961e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5043 + }, + { + "completion_length": 2609.8333740234375, + "epoch": 0.7689024390243903, + "grad_norm": 0.10080428903961948, + "kl": 0.06298828125, + "learning_rate": 4.6215399477813553e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5044 + }, + { + "completion_length": 1088.5, + "epoch": 0.7690548780487805, + "grad_norm": 0.08133947332192756, + "kl": 0.0377197265625, + "learning_rate": 4.615778670893241e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5045 + }, + { + "completion_length": 964.0, + "epoch": 0.7692073170731707, + "grad_norm": 0.14794678141398307, + "kl": 0.089111328125, + "learning_rate": 4.6100203342328897e-07, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5046 + }, + { + "completion_length": 1105.6667175292969, + "epoch": 0.769359756097561, + "grad_norm": 0.09824987581456086, + "kl": 0.07568359375, + "learning_rate": 4.604264939430732e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5047 + }, + { + "completion_length": 872.6666870117188, + "epoch": 0.7695121951219512, + "grad_norm": 0.10195497330900938, + "kl": 0.065673828125, + "learning_rate": 4.598512488116376e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5048 + }, + { + "completion_length": 2347.3333740234375, + "epoch": 0.7696646341463415, + "grad_norm": 0.08691979842095511, + "kl": 0.0738525390625, + "learning_rate": 4.592762981918591e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5049 + }, + { + "completion_length": 2197.0000915527344, + "epoch": 0.7698170731707317, + "grad_norm": 0.09296622412975912, + "kl": 0.05517578125, + "learning_rate": 4.5870164224653156e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5050 + }, + { + "completion_length": 1322.3333740234375, + "epoch": 0.769969512195122, + "grad_norm": 0.08937293492934108, + "kl": 0.053466796875, + "learning_rate": 4.581272811383657e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5051 + }, + { + "completion_length": 847.5, + "epoch": 0.7701219512195122, + "grad_norm": 0.10784540911277611, + "kl": 0.06512451171875, + "learning_rate": 4.5755321502998733e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5052 + }, + { + "completion_length": 1616.8334350585938, + "epoch": 0.7702743902439024, + "grad_norm": 1.3677020722121165, + "kl": 0.0487060546875, + "learning_rate": 4.5697944408394017e-07, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5053 + }, + { + "completion_length": 2537.8333740234375, + "epoch": 0.7704268292682926, + "grad_norm": 0.06766493711004919, + "kl": 0.0570068359375, + "learning_rate": 4.5640596846268395e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5054 + }, + { + "completion_length": 1328.8333740234375, + "epoch": 0.770579268292683, + "grad_norm": 0.078759894164069, + "kl": 0.05029296875, + "learning_rate": 4.5583278832859465e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5055 + }, + { + "completion_length": 1320.5000610351562, + "epoch": 0.7707317073170732, + "grad_norm": 0.14190037862998564, + "kl": 0.06396484375, + "learning_rate": 4.552599038439651e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5056 + }, + { + "completion_length": 2014.8333740234375, + "epoch": 0.7708841463414634, + "grad_norm": 0.07375700660695426, + "kl": 0.04052734375, + "learning_rate": 4.546873151710027e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5057 + }, + { + "completion_length": 1353.6667175292969, + "epoch": 0.7710365853658536, + "grad_norm": 0.18440357678171296, + "kl": 0.0621337890625, + "learning_rate": 4.541150224718336e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5058 + }, + { + "completion_length": 1432.3333435058594, + "epoch": 0.771189024390244, + "grad_norm": 0.15257659049946065, + "kl": 0.0606689453125, + "learning_rate": 4.535430259084987e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5059 + }, + { + "completion_length": 1760.8333740234375, + "epoch": 0.7713414634146342, + "grad_norm": 0.10135669463353487, + "kl": 0.073974609375, + "learning_rate": 4.529713256429556e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5060 + }, + { + "completion_length": 2183.666748046875, + "epoch": 0.7714939024390244, + "grad_norm": 0.09693687836908436, + "kl": 0.080322265625, + "learning_rate": 4.523999218370767e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5061 + }, + { + "completion_length": 928.8333435058594, + "epoch": 0.7716463414634146, + "grad_norm": 0.12498189090468764, + "kl": 0.057373046875, + "learning_rate": 4.51828814652652e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5062 + }, + { + "completion_length": 2546.3333740234375, + "epoch": 0.7717987804878049, + "grad_norm": 0.08759531959689484, + "kl": 0.0616455078125, + "learning_rate": 4.51258004251387e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5063 + }, + { + "completion_length": 1296.0000610351562, + "epoch": 0.7719512195121951, + "grad_norm": 0.07976577027220166, + "kl": 0.079345703125, + "learning_rate": 4.506874907949034e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5064 + }, + { + "completion_length": 1644.5, + "epoch": 0.7721036585365854, + "grad_norm": 0.10916616867716186, + "kl": 0.060791015625, + "learning_rate": 4.5011727444473774e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5065 + }, + { + "completion_length": 689.3333587646484, + "epoch": 0.7722560975609756, + "grad_norm": 2.3972419386141994, + "kl": 0.068359375, + "learning_rate": 4.495473553623436e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5066 + }, + { + "completion_length": 1853.3333435058594, + "epoch": 0.7724085365853659, + "grad_norm": 0.1282479394605145, + "kl": 0.058837890625, + "learning_rate": 4.4897773370909027e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5067 + }, + { + "completion_length": 856.5000305175781, + "epoch": 0.7725609756097561, + "grad_norm": 0.11293571791715726, + "kl": 0.0616455078125, + "learning_rate": 4.484084096462623e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5068 + }, + { + "completion_length": 985.8333435058594, + "epoch": 0.7727134146341463, + "grad_norm": 0.10407296566936515, + "kl": 0.0621337890625, + "learning_rate": 4.4783938333506083e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5069 + }, + { + "completion_length": 1701.8333740234375, + "epoch": 0.7728658536585366, + "grad_norm": 0.2762608749622806, + "kl": 0.07666015625, + "learning_rate": 4.472706549366011e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5070 + }, + { + "completion_length": 869.6666870117188, + "epoch": 0.7730182926829269, + "grad_norm": 0.10413445621105374, + "kl": 0.03857421875, + "learning_rate": 4.467022246119158e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5071 + }, + { + "completion_length": 1082.0000610351562, + "epoch": 0.7731707317073171, + "grad_norm": 0.10973545648320546, + "kl": 0.0653076171875, + "learning_rate": 4.461340925219522e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5072 + }, + { + "completion_length": 862.0000305175781, + "epoch": 0.7733231707317073, + "grad_norm": 0.09274142811936778, + "kl": 0.049560546875, + "learning_rate": 4.4556625882757334e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5073 + }, + { + "completion_length": 1262.666748046875, + "epoch": 0.7734756097560975, + "grad_norm": 0.13938139980985417, + "kl": 0.0640869140625, + "learning_rate": 4.4499872368955846e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5074 + }, + { + "completion_length": 1501.8333435058594, + "epoch": 0.7736280487804879, + "grad_norm": 0.24242901494016078, + "kl": 0.0657958984375, + "learning_rate": 4.444314872686e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5075 + }, + { + "completion_length": 1218.0000610351562, + "epoch": 0.7737804878048781, + "grad_norm": 0.11151776798430921, + "kl": 0.0474853515625, + "learning_rate": 4.438645497253088e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5076 + }, + { + "completion_length": 2328.8333740234375, + "epoch": 0.7739329268292683, + "grad_norm": 0.12464209561036348, + "kl": 0.070068359375, + "learning_rate": 4.4329791122021004e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5077 + }, + { + "completion_length": 1044.6667175292969, + "epoch": 0.7740853658536585, + "grad_norm": 1.67773868189826, + "kl": 0.083251953125, + "learning_rate": 4.427315719137426e-07, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5078 + }, + { + "completion_length": 903.5000305175781, + "epoch": 0.7742378048780488, + "grad_norm": 0.1373909933620719, + "kl": 0.07275390625, + "learning_rate": 4.4216553196626256e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5079 + }, + { + "completion_length": 2599.0000610351562, + "epoch": 0.774390243902439, + "grad_norm": 0.06534672824709095, + "kl": 0.059326171875, + "learning_rate": 4.4159979153804064e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5080 + }, + { + "completion_length": 663.8333435058594, + "epoch": 0.7745426829268293, + "grad_norm": 0.19109273603789034, + "kl": 0.052001953125, + "learning_rate": 4.4103435078926263e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5081 + }, + { + "completion_length": 1223.3333740234375, + "epoch": 0.7746951219512195, + "grad_norm": 0.08115304140543864, + "kl": 0.0533447265625, + "learning_rate": 4.4046920988003014e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5082 + }, + { + "completion_length": 1176.1666870117188, + "epoch": 0.7748475609756098, + "grad_norm": 0.11494244526915193, + "kl": 0.072265625, + "learning_rate": 4.3990436897035833e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5083 + }, + { + "completion_length": 1586.666748046875, + "epoch": 0.775, + "grad_norm": 0.07143257541068036, + "kl": 0.0648193359375, + "learning_rate": 4.3933982822017883e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5084 + }, + { + "completion_length": 764.5000305175781, + "epoch": 0.7751524390243902, + "grad_norm": 0.12532446232365002, + "kl": 0.077392578125, + "learning_rate": 4.387755877893379e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5085 + }, + { + "completion_length": 1761.8334350585938, + "epoch": 0.7753048780487805, + "grad_norm": 0.0677049396373609, + "kl": 0.0423583984375, + "learning_rate": 4.382116478375966e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5086 + }, + { + "completion_length": 1866.6666870117188, + "epoch": 0.7754573170731708, + "grad_norm": 0.07632913763659473, + "kl": 0.0596923828125, + "learning_rate": 4.3764800852463165e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5087 + }, + { + "completion_length": 506.3333435058594, + "epoch": 0.775609756097561, + "grad_norm": 0.22657954581527234, + "kl": 0.093994140625, + "learning_rate": 4.3708467001003305e-07, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5088 + }, + { + "completion_length": 1012.5000305175781, + "epoch": 0.7757621951219512, + "grad_norm": 0.10268270719795415, + "kl": 0.070556640625, + "learning_rate": 4.3652163245330687e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5089 + }, + { + "completion_length": 751.8333740234375, + "epoch": 0.7759146341463414, + "grad_norm": 0.10269908422804135, + "kl": 0.065185546875, + "learning_rate": 4.35958896013874e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5090 + }, + { + "completion_length": 1672.5001220703125, + "epoch": 0.7760670731707318, + "grad_norm": 0.07500261404214882, + "kl": 0.0694580078125, + "learning_rate": 4.353964608510696e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5091 + }, + { + "completion_length": 1488.0000610351562, + "epoch": 0.776219512195122, + "grad_norm": 1.4928036438629337, + "kl": 0.0677490234375, + "learning_rate": 4.348343271241441e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5092 + }, + { + "completion_length": 1259.0, + "epoch": 0.7763719512195122, + "grad_norm": 0.11172437122901158, + "kl": 0.0645751953125, + "learning_rate": 4.342724949922614e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5093 + }, + { + "completion_length": 1352.8333435058594, + "epoch": 0.7765243902439024, + "grad_norm": 0.08098326085946546, + "kl": 0.0574951171875, + "learning_rate": 4.337109646145005e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5094 + }, + { + "completion_length": 890.3333435058594, + "epoch": 0.7766768292682927, + "grad_norm": 0.16056227766579576, + "kl": 0.0626220703125, + "learning_rate": 4.3314973614985705e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5095 + }, + { + "completion_length": 1375.8333435058594, + "epoch": 0.776829268292683, + "grad_norm": 0.12331919099642337, + "kl": 0.0582275390625, + "learning_rate": 4.3258880975723777e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5096 + }, + { + "completion_length": 729.5000305175781, + "epoch": 0.7769817073170732, + "grad_norm": 2.5910329196020054, + "kl": 0.06591796875, + "learning_rate": 4.32028185595466e-07, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5097 + }, + { + "completion_length": 1514.166748046875, + "epoch": 0.7771341463414634, + "grad_norm": 0.09189199808858006, + "kl": 0.07373046875, + "learning_rate": 4.3146786382327893e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5098 + }, + { + "completion_length": 1180.166748046875, + "epoch": 0.7772865853658537, + "grad_norm": 0.08805297993050355, + "kl": 0.0596923828125, + "learning_rate": 4.309078445993281e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5099 + }, + { + "completion_length": 875.0000305175781, + "epoch": 0.7774390243902439, + "grad_norm": 0.1276436203047973, + "kl": 0.05615234375, + "learning_rate": 4.3034812808218017e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5100 + }, + { + "completion_length": 1536.666748046875, + "epoch": 0.7775914634146341, + "grad_norm": 0.10528592818942298, + "kl": 0.06591796875, + "learning_rate": 4.297887144303142e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5101 + }, + { + "completion_length": 2213.166748046875, + "epoch": 0.7777439024390244, + "grad_norm": 0.07315384863729502, + "kl": 0.066162109375, + "learning_rate": 4.292296038021254e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5102 + }, + { + "completion_length": 1096.3333740234375, + "epoch": 0.7778963414634147, + "grad_norm": 0.10257465570477162, + "kl": 0.0543212890625, + "learning_rate": 4.286707963559224e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5103 + }, + { + "completion_length": 952.1666870117188, + "epoch": 0.7780487804878049, + "grad_norm": 0.23409789731316788, + "kl": 0.083740234375, + "learning_rate": 4.2811229224992807e-07, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5104 + }, + { + "completion_length": 950.3333740234375, + "epoch": 0.7782012195121951, + "grad_norm": 0.1184377450856312, + "kl": 0.0589599609375, + "learning_rate": 4.2755409164227973e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5105 + }, + { + "completion_length": 1468.8333740234375, + "epoch": 0.7783536585365853, + "grad_norm": 0.10552836157376387, + "kl": 0.0623779296875, + "learning_rate": 4.269961946910277e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5106 + }, + { + "completion_length": 1419.0000305175781, + "epoch": 0.7785060975609757, + "grad_norm": 0.12094820754696847, + "kl": 0.0654296875, + "learning_rate": 4.2643860155413717e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5107 + }, + { + "completion_length": 1198.6666870117188, + "epoch": 0.7786585365853659, + "grad_norm": 0.08100533472090882, + "kl": 0.051513671875, + "learning_rate": 4.258813123894875e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5108 + }, + { + "completion_length": 541.8333435058594, + "epoch": 0.7788109756097561, + "grad_norm": 0.18803380843454628, + "kl": 0.119873046875, + "learning_rate": 4.253243273548719e-07, + "loss": 0.0048, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5109 + }, + { + "completion_length": 1482.3333435058594, + "epoch": 0.7789634146341463, + "grad_norm": 0.2704066094945113, + "kl": 0.0775146484375, + "learning_rate": 4.247676466079964e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5110 + }, + { + "completion_length": 926.1666870117188, + "epoch": 0.7791158536585366, + "grad_norm": 0.10311578145033572, + "kl": 0.0648193359375, + "learning_rate": 4.2421127030648216e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5111 + }, + { + "completion_length": 1266.166748046875, + "epoch": 0.7792682926829269, + "grad_norm": 0.09770607741265318, + "kl": 0.06787109375, + "learning_rate": 4.2365519860786316e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5112 + }, + { + "completion_length": 1601.5, + "epoch": 0.7794207317073171, + "grad_norm": 0.09487649620660965, + "kl": 0.0574951171875, + "learning_rate": 4.2309943166958915e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5113 + }, + { + "completion_length": 1496.3333740234375, + "epoch": 0.7795731707317073, + "grad_norm": 0.1069151873592649, + "kl": 0.06689453125, + "learning_rate": 4.2254396964902054e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5114 + }, + { + "completion_length": 1915.3334350585938, + "epoch": 0.7797256097560976, + "grad_norm": 0.08682297719809022, + "kl": 0.06884765625, + "learning_rate": 4.2198881270343335e-07, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5115 + }, + { + "completion_length": 675.0000305175781, + "epoch": 0.7798780487804878, + "grad_norm": 0.11227622741715595, + "kl": 0.0650634765625, + "learning_rate": 4.2143396099001724e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5116 + }, + { + "completion_length": 2194.666748046875, + "epoch": 0.780030487804878, + "grad_norm": 0.09792573563657163, + "kl": 0.04736328125, + "learning_rate": 4.208794146658745e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5117 + }, + { + "completion_length": 2213.1666870117188, + "epoch": 0.7801829268292683, + "grad_norm": 0.08581045941986318, + "kl": 0.060546875, + "learning_rate": 4.203251738880223e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5118 + }, + { + "completion_length": 1534.8333740234375, + "epoch": 0.7803353658536586, + "grad_norm": 0.08024084100005799, + "kl": 0.075439453125, + "learning_rate": 4.1977123881338943e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5119 + }, + { + "completion_length": 1180.5000305175781, + "epoch": 0.7804878048780488, + "grad_norm": 0.1086846435954953, + "kl": 0.0565185546875, + "learning_rate": 4.192176095988196e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5120 + }, + { + "completion_length": 1259.6666870117188, + "epoch": 0.780640243902439, + "grad_norm": 0.1368483955706034, + "kl": 0.0634765625, + "learning_rate": 4.186642864010695e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5121 + }, + { + "completion_length": 1470.1667175292969, + "epoch": 0.7807926829268292, + "grad_norm": 0.09679460551529948, + "kl": 0.06884765625, + "learning_rate": 4.1811126937680915e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5122 + }, + { + "completion_length": 1396.3333740234375, + "epoch": 0.7809451219512196, + "grad_norm": 0.1100967005814818, + "kl": 0.0606689453125, + "learning_rate": 4.1755855868262226e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5123 + }, + { + "completion_length": 2020.166748046875, + "epoch": 0.7810975609756098, + "grad_norm": 0.05025102267759321, + "kl": 0.0509033203125, + "learning_rate": 4.170061544750048e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5124 + }, + { + "completion_length": 1130.1666870117188, + "epoch": 0.78125, + "grad_norm": 0.17233504523766496, + "kl": 0.083740234375, + "learning_rate": 4.1645405691036676e-07, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5125 + }, + { + "completion_length": 1566.5000610351562, + "epoch": 0.7814024390243902, + "grad_norm": 0.09045733992634178, + "kl": 0.069580078125, + "learning_rate": 4.159022661450313e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5126 + }, + { + "completion_length": 610.0, + "epoch": 0.7815548780487804, + "grad_norm": 1.8307019034956793, + "kl": 0.0635986328125, + "learning_rate": 4.1535078233523484e-07, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5127 + }, + { + "completion_length": 1265.5000457763672, + "epoch": 0.7817073170731708, + "grad_norm": 0.11749417632437592, + "kl": 0.06640625, + "learning_rate": 4.147996056371258e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5128 + }, + { + "completion_length": 1315.8333740234375, + "epoch": 0.781859756097561, + "grad_norm": 0.10647608199713482, + "kl": 0.0648193359375, + "learning_rate": 4.1424873620676695e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5129 + }, + { + "completion_length": 2136.666748046875, + "epoch": 0.7820121951219512, + "grad_norm": 1.5827906999720631, + "kl": 0.0528564453125, + "learning_rate": 4.1369817420013345e-07, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5130 + }, + { + "completion_length": 1069.3333435058594, + "epoch": 0.7821646341463414, + "grad_norm": 0.11722191693293672, + "kl": 0.07080078125, + "learning_rate": 4.1314791977311337e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5131 + }, + { + "completion_length": 1195.5000610351562, + "epoch": 0.7823170731707317, + "grad_norm": 0.07820960208815259, + "kl": 0.05322265625, + "learning_rate": 4.1259797308150816e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5132 + }, + { + "completion_length": 1404.3333740234375, + "epoch": 0.782469512195122, + "grad_norm": 0.08468551805677162, + "kl": 0.062744140625, + "learning_rate": 4.120483342810317e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5133 + }, + { + "completion_length": 885.8333740234375, + "epoch": 0.7826219512195122, + "grad_norm": 0.0951243330697026, + "kl": 0.07568359375, + "learning_rate": 4.114990035273106e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5134 + }, + { + "completion_length": 1639.166748046875, + "epoch": 0.7827743902439024, + "grad_norm": 1.3457477805512508, + "kl": 0.064453125, + "learning_rate": 4.1094998097588475e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5135 + }, + { + "completion_length": 1177.0000610351562, + "epoch": 0.7829268292682927, + "grad_norm": 0.09700257876810188, + "kl": 0.0531005859375, + "learning_rate": 4.1040126678220656e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5136 + }, + { + "completion_length": 2287.0001220703125, + "epoch": 0.7830792682926829, + "grad_norm": 1.8521604774610134, + "kl": 0.068115234375, + "learning_rate": 4.098528611016406e-07, + "loss": 0.0027, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5137 + }, + { + "completion_length": 1034.6666870117188, + "epoch": 0.7832317073170731, + "grad_norm": 0.10769061511800379, + "kl": 0.062255859375, + "learning_rate": 4.093047640894645e-07, + "loss": 0.0025, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 5138 + }, + { + "completion_length": 623.8333740234375, + "epoch": 0.7833841463414634, + "grad_norm": 0.11385826153715534, + "kl": 0.05712890625, + "learning_rate": 4.0875697590086893e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5139 + }, + { + "completion_length": 986.0000610351562, + "epoch": 0.7835365853658537, + "grad_norm": 0.13035473535212863, + "kl": 0.0526123046875, + "learning_rate": 4.0820949669095696e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5140 + }, + { + "completion_length": 946.6667175292969, + "epoch": 0.7836890243902439, + "grad_norm": 0.1501753591571205, + "kl": 0.08154296875, + "learning_rate": 4.07662326614743e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5141 + }, + { + "completion_length": 1382.1666870117188, + "epoch": 0.7838414634146341, + "grad_norm": 0.11119087312075299, + "kl": 0.0587158203125, + "learning_rate": 4.0711546582715544e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5142 + }, + { + "completion_length": 904.0000305175781, + "epoch": 0.7839939024390243, + "grad_norm": 1.8600157257372005, + "kl": 0.067138671875, + "learning_rate": 4.065689144830345e-07, + "loss": 0.0027, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5143 + }, + { + "completion_length": 2146.166748046875, + "epoch": 0.7841463414634147, + "grad_norm": 1.486097649374332, + "kl": 0.0682373046875, + "learning_rate": 4.060226727371327e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5144 + }, + { + "completion_length": 1657.3333740234375, + "epoch": 0.7842987804878049, + "grad_norm": 0.07087098974431114, + "kl": 0.0511474609375, + "learning_rate": 4.0547674074411536e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5145 + }, + { + "completion_length": 1211.5, + "epoch": 0.7844512195121951, + "grad_norm": 0.10576316741550806, + "kl": 0.060302734375, + "learning_rate": 4.04931118658559e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5146 + }, + { + "completion_length": 1031.1666870117188, + "epoch": 0.7846036585365853, + "grad_norm": 0.11049397518691141, + "kl": 0.055908203125, + "learning_rate": 4.0438580663495365e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5147 + }, + { + "completion_length": 1602.8333740234375, + "epoch": 0.7847560975609756, + "grad_norm": 1.5376181626252132, + "kl": 0.0556640625, + "learning_rate": 4.038408048277009e-07, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5148 + }, + { + "completion_length": 2413.3333435058594, + "epoch": 0.7849085365853659, + "grad_norm": 0.08118620195964367, + "kl": 0.051025390625, + "learning_rate": 4.032961133911145e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5149 + }, + { + "completion_length": 1147.6667175292969, + "epoch": 0.7850609756097561, + "grad_norm": 1.872234462123205, + "kl": 0.060302734375, + "learning_rate": 4.027517324794207e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5150 + }, + { + "completion_length": 1577.8333740234375, + "epoch": 0.7852134146341463, + "grad_norm": 0.08778168757150386, + "kl": 0.065673828125, + "learning_rate": 4.022076622467574e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5151 + }, + { + "completion_length": 2248.5000610351562, + "epoch": 0.7853658536585366, + "grad_norm": 0.12393642167338438, + "kl": 0.0621337890625, + "learning_rate": 4.0166390284717475e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5152 + }, + { + "completion_length": 1364.6666870117188, + "epoch": 0.7855182926829268, + "grad_norm": 0.09430204260343952, + "kl": 0.0521240234375, + "learning_rate": 4.011204544346349e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5153 + }, + { + "completion_length": 1441.166748046875, + "epoch": 0.785670731707317, + "grad_norm": 0.1448787143218, + "kl": 0.0401611328125, + "learning_rate": 4.0057731716301235e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5154 + }, + { + "completion_length": 1582.0, + "epoch": 0.7858231707317073, + "grad_norm": 0.0856003037069861, + "kl": 0.068115234375, + "learning_rate": 4.00034491186092e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5155 + }, + { + "completion_length": 1009.5000305175781, + "epoch": 0.7859756097560976, + "grad_norm": 0.10168706170586578, + "kl": 0.057373046875, + "learning_rate": 3.994919766575722e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5156 + }, + { + "completion_length": 1673.5, + "epoch": 0.7861280487804878, + "grad_norm": 0.08477948450201736, + "kl": 0.0638427734375, + "learning_rate": 3.9894977373106263e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5157 + }, + { + "completion_length": 2041.166748046875, + "epoch": 0.786280487804878, + "grad_norm": 0.140787755088375, + "kl": 0.0748291015625, + "learning_rate": 3.9840788256008494e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5158 + }, + { + "completion_length": 1043.1666870117188, + "epoch": 0.7864329268292682, + "grad_norm": 2.011552087812103, + "kl": 0.087646484375, + "learning_rate": 3.9786630329807157e-07, + "loss": 0.0035, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5159 + }, + { + "completion_length": 1380.3333740234375, + "epoch": 0.7865853658536586, + "grad_norm": 0.1317164944643474, + "kl": 0.0572509765625, + "learning_rate": 3.973250360983677e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5160 + }, + { + "completion_length": 1923.3334350585938, + "epoch": 0.7867378048780488, + "grad_norm": 0.26379075799842394, + "kl": 0.0643310546875, + "learning_rate": 3.967840811142297e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5161 + }, + { + "completion_length": 977.5000305175781, + "epoch": 0.786890243902439, + "grad_norm": 0.10027422047493231, + "kl": 0.058349609375, + "learning_rate": 3.962434384988258e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5162 + }, + { + "completion_length": 1750.666748046875, + "epoch": 0.7870426829268292, + "grad_norm": 1.3028701980069428, + "kl": 0.0609130859375, + "learning_rate": 3.957031084052358e-07, + "loss": 0.0024, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5163 + }, + { + "completion_length": 1735.166748046875, + "epoch": 0.7871951219512195, + "grad_norm": 0.16900791473494856, + "kl": 0.057861328125, + "learning_rate": 3.9516309098645e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5164 + }, + { + "completion_length": 1153.0, + "epoch": 0.7873475609756098, + "grad_norm": 0.16068342026558108, + "kl": 0.069580078125, + "learning_rate": 3.9462338639537144e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5165 + }, + { + "completion_length": 1017.5000610351562, + "epoch": 0.7875, + "grad_norm": 0.09140861227239712, + "kl": 0.065185546875, + "learning_rate": 3.9408399478481406e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5166 + }, + { + "completion_length": 1077.3333740234375, + "epoch": 0.7876524390243902, + "grad_norm": 1.3665317546042732, + "kl": 0.08154296875, + "learning_rate": 3.935449163075035e-07, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5167 + }, + { + "completion_length": 1497.3333740234375, + "epoch": 0.7878048780487805, + "grad_norm": 0.1264146770470765, + "kl": 0.0592041015625, + "learning_rate": 3.930061511160762e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5168 + }, + { + "completion_length": 1654.6666870117188, + "epoch": 0.7879573170731707, + "grad_norm": 0.07221924184657123, + "kl": 0.05029296875, + "learning_rate": 3.9246769936308023e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5169 + }, + { + "completion_length": 1300.6666717529297, + "epoch": 0.788109756097561, + "grad_norm": 1.9585655283575523, + "kl": 0.0667724609375, + "learning_rate": 3.919295612009749e-07, + "loss": 0.0027, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5170 + }, + { + "completion_length": 870.5, + "epoch": 0.7882621951219512, + "grad_norm": 0.14017384393261503, + "kl": 0.0665283203125, + "learning_rate": 3.9139173678213123e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5171 + }, + { + "completion_length": 2535.6666870117188, + "epoch": 0.7884146341463415, + "grad_norm": 0.7197760867640935, + "kl": 0.048828125, + "learning_rate": 3.9085422625882983e-07, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5172 + }, + { + "completion_length": 2469.8334350585938, + "epoch": 0.7885670731707317, + "grad_norm": 0.08866831046470128, + "kl": 0.056884765625, + "learning_rate": 3.9031702978326406e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5173 + }, + { + "completion_length": 2375.0001220703125, + "epoch": 0.7887195121951219, + "grad_norm": 0.05741133602839058, + "kl": 0.0478515625, + "learning_rate": 3.897801475075376e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5174 + }, + { + "completion_length": 2821.3334350585938, + "epoch": 0.7888719512195121, + "grad_norm": 0.06474528943443514, + "kl": 0.049560546875, + "learning_rate": 3.8924357958366533e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5175 + }, + { + "completion_length": 3683.166748046875, + "epoch": 0.7890243902439025, + "grad_norm": 0.04613343759441298, + "kl": 0.0465087890625, + "learning_rate": 3.8870732616357364e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5176 + }, + { + "completion_length": 980.1667175292969, + "epoch": 0.7891768292682927, + "grad_norm": 0.5494951642668362, + "kl": 0.0589599609375, + "learning_rate": 3.881713873990985e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5177 + }, + { + "completion_length": 2316.6666870117188, + "epoch": 0.7893292682926829, + "grad_norm": 1.038691685764386, + "kl": 0.05078125, + "learning_rate": 3.876357634419881e-07, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5178 + }, + { + "completion_length": 2254.0000610351562, + "epoch": 0.7894817073170731, + "grad_norm": 0.06359844386770613, + "kl": 0.05908203125, + "learning_rate": 3.871004544439009e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5179 + }, + { + "completion_length": 1310.6667175292969, + "epoch": 0.7896341463414634, + "grad_norm": 0.10478069938458537, + "kl": 0.042724609375, + "learning_rate": 3.865654605564065e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5180 + }, + { + "completion_length": 1229.6666870117188, + "epoch": 0.7897865853658537, + "grad_norm": 0.06321724990981754, + "kl": 0.049072265625, + "learning_rate": 3.860307819309853e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5181 + }, + { + "completion_length": 2821.6666870117188, + "epoch": 0.7899390243902439, + "grad_norm": 0.07172121721008345, + "kl": 0.03955078125, + "learning_rate": 3.8549641871902756e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5182 + }, + { + "completion_length": 2552.666748046875, + "epoch": 0.7900914634146341, + "grad_norm": 0.06409742548631829, + "kl": 0.052734375, + "learning_rate": 3.8496237107183517e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5183 + }, + { + "completion_length": 2331.166748046875, + "epoch": 0.7902439024390244, + "grad_norm": 0.059433696041562775, + "kl": 0.056396484375, + "learning_rate": 3.8442863914062065e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5184 + }, + { + "completion_length": 1684.3333740234375, + "epoch": 0.7903963414634146, + "grad_norm": 0.12379647348947725, + "kl": 0.0611572265625, + "learning_rate": 3.8389522307650674e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5185 + }, + { + "completion_length": 2719.3333740234375, + "epoch": 0.7905487804878049, + "grad_norm": 0.053831302370421, + "kl": 0.0433349609375, + "learning_rate": 3.833621230305269e-07, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5186 + }, + { + "completion_length": 1604.6666870117188, + "epoch": 0.7907012195121951, + "grad_norm": 0.11053792667934868, + "kl": 0.0509033203125, + "learning_rate": 3.828293391536249e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5187 + }, + { + "completion_length": 1467.166748046875, + "epoch": 0.7908536585365854, + "grad_norm": 0.07856896537834551, + "kl": 0.0538330078125, + "learning_rate": 3.822968715966555e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5188 + }, + { + "completion_length": 2129.666748046875, + "epoch": 0.7910060975609756, + "grad_norm": 0.21484475738645753, + "kl": 0.0540771484375, + "learning_rate": 3.8176472051038375e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5189 + }, + { + "completion_length": 2243.5000915527344, + "epoch": 0.7911585365853658, + "grad_norm": 1.0812062858097913, + "kl": 0.0548095703125, + "learning_rate": 3.8123288604548426e-07, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5190 + }, + { + "completion_length": 4019.5, + "epoch": 0.791310975609756, + "grad_norm": 0.04613123544190271, + "kl": 0.0399169921875, + "learning_rate": 3.80701368352543e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5191 + }, + { + "completion_length": 1403.666748046875, + "epoch": 0.7914634146341464, + "grad_norm": 0.12451518883560762, + "kl": 0.083251953125, + "learning_rate": 3.8017016758205597e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5192 + }, + { + "completion_length": 2312.8333740234375, + "epoch": 0.7916158536585366, + "grad_norm": 1.750956620067753, + "kl": 0.080078125, + "learning_rate": 3.7963928388442926e-07, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5193 + }, + { + "completion_length": 3404.8333740234375, + "epoch": 0.7917682926829268, + "grad_norm": 0.04425200672285835, + "kl": 0.04931640625, + "learning_rate": 3.791087174099798e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5194 + }, + { + "completion_length": 1434.0, + "epoch": 0.791920731707317, + "grad_norm": 1.6566875526442693, + "kl": 0.0648193359375, + "learning_rate": 3.7857846830893334e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5195 + }, + { + "completion_length": 1883.1666870117188, + "epoch": 0.7920731707317074, + "grad_norm": 0.12254391951192156, + "kl": 0.07421875, + "learning_rate": 3.7804853673142704e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5196 + }, + { + "completion_length": 2100.166748046875, + "epoch": 0.7922256097560976, + "grad_norm": 0.06086471598191463, + "kl": 0.0513916015625, + "learning_rate": 3.775189228275075e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5197 + }, + { + "completion_length": 1791.3333740234375, + "epoch": 0.7923780487804878, + "grad_norm": 0.11477726947400611, + "kl": 0.0618896484375, + "learning_rate": 3.7698962674713193e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5198 + }, + { + "completion_length": 1609.0000610351562, + "epoch": 0.792530487804878, + "grad_norm": 0.30362950470101757, + "kl": 0.0638427734375, + "learning_rate": 3.7646064864016764e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5199 + }, + { + "completion_length": 1313.8333435058594, + "epoch": 0.7926829268292683, + "grad_norm": 0.08590914492729584, + "kl": 0.055908203125, + "learning_rate": 3.759319886563905e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5200 + }, + { + "completion_length": 1830.8334350585938, + "epoch": 0.7928353658536585, + "grad_norm": 0.06612763675308626, + "kl": 0.0406494140625, + "learning_rate": 3.754036469454876e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5201 + }, + { + "completion_length": 3192.0001220703125, + "epoch": 0.7929878048780488, + "grad_norm": 0.778594735351818, + "kl": 0.0465087890625, + "learning_rate": 3.7487562365705583e-07, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5202 + }, + { + "completion_length": 1174.6666717529297, + "epoch": 0.793140243902439, + "grad_norm": 0.14319623011101248, + "kl": 0.044189453125, + "learning_rate": 3.7434791894060227e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5203 + }, + { + "completion_length": 1384.166748046875, + "epoch": 0.7932926829268293, + "grad_norm": 0.08083927979645418, + "kl": 0.0528564453125, + "learning_rate": 3.7382053294554163e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5204 + }, + { + "completion_length": 1107.8333740234375, + "epoch": 0.7934451219512195, + "grad_norm": 0.07429228605475019, + "kl": 0.0421142578125, + "learning_rate": 3.7329346582120135e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5205 + }, + { + "completion_length": 3068.0, + "epoch": 0.7935975609756097, + "grad_norm": 0.6917607302722762, + "kl": 0.0537109375, + "learning_rate": 3.7276671771681696e-07, + "loss": 0.0022, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5206 + }, + { + "completion_length": 2208.1666870117188, + "epoch": 0.79375, + "grad_norm": 0.06352778169390184, + "kl": 0.0533447265625, + "learning_rate": 3.722402887815341e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5207 + }, + { + "completion_length": 3583.666748046875, + "epoch": 0.7939024390243903, + "grad_norm": 0.03882879648700813, + "kl": 0.0413818359375, + "learning_rate": 3.7171417916440714e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5208 + }, + { + "completion_length": 1584.8333740234375, + "epoch": 0.7940548780487805, + "grad_norm": 0.12885671903968463, + "kl": 0.0587158203125, + "learning_rate": 3.711883890144011e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5209 + }, + { + "completion_length": 2254.3333740234375, + "epoch": 0.7942073170731707, + "grad_norm": 0.06007838897743042, + "kl": 0.0557861328125, + "learning_rate": 3.7066291848039024e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5210 + }, + { + "completion_length": 2233.666717529297, + "epoch": 0.7943597560975609, + "grad_norm": 0.0738548152514349, + "kl": 0.0474853515625, + "learning_rate": 3.701377677111584e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5211 + }, + { + "completion_length": 2147.166748046875, + "epoch": 0.7945121951219513, + "grad_norm": 0.0808657907393624, + "kl": 0.0509033203125, + "learning_rate": 3.696129368553989e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5212 + }, + { + "completion_length": 1705.3334350585938, + "epoch": 0.7946646341463415, + "grad_norm": 0.08319569968411139, + "kl": 0.0498046875, + "learning_rate": 3.6908842606171375e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5213 + }, + { + "completion_length": 3225.666748046875, + "epoch": 0.7948170731707317, + "grad_norm": 0.04406628891149872, + "kl": 0.0458984375, + "learning_rate": 3.6856423547861514e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5214 + }, + { + "completion_length": 2176.5000915527344, + "epoch": 0.7949695121951219, + "grad_norm": 0.0742684955314553, + "kl": 0.0411376953125, + "learning_rate": 3.680403652545245e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5215 + }, + { + "completion_length": 1332.8333740234375, + "epoch": 0.7951219512195122, + "grad_norm": 0.08057214846861467, + "kl": 0.0574951171875, + "learning_rate": 3.6751681553777236e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5216 + }, + { + "completion_length": 752.8333740234375, + "epoch": 0.7952743902439025, + "grad_norm": 0.10528121775132321, + "kl": 0.052734375, + "learning_rate": 3.669935864765992e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5217 + }, + { + "completion_length": 1583.8333740234375, + "epoch": 0.7954268292682927, + "grad_norm": 0.21249342848513492, + "kl": 0.084228515625, + "learning_rate": 3.6647067821915294e-07, + "loss": 0.0034, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5218 + }, + { + "completion_length": 1074.6666870117188, + "epoch": 0.7955792682926829, + "grad_norm": 0.12109199802503831, + "kl": 0.0545654296875, + "learning_rate": 3.6594809091349245e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5219 + }, + { + "completion_length": 2608.0000610351562, + "epoch": 0.7957317073170732, + "grad_norm": 0.09218335205847206, + "kl": 0.0538330078125, + "learning_rate": 3.6542582470758496e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5220 + }, + { + "completion_length": 2165.5001220703125, + "epoch": 0.7958841463414634, + "grad_norm": 0.09459453617440267, + "kl": 0.082275390625, + "learning_rate": 3.649038797493073e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5221 + }, + { + "completion_length": 859.5000305175781, + "epoch": 0.7960365853658536, + "grad_norm": 0.12125122354802045, + "kl": 0.0596923828125, + "learning_rate": 3.643822561864442e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5222 + }, + { + "completion_length": 2711.5001220703125, + "epoch": 0.7961890243902439, + "grad_norm": 0.04562577131209763, + "kl": 0.0443115234375, + "learning_rate": 3.6386095416669015e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5223 + }, + { + "completion_length": 1151.8333740234375, + "epoch": 0.7963414634146342, + "grad_norm": 0.19390031937586893, + "kl": 0.067138671875, + "learning_rate": 3.633399738376491e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5224 + }, + { + "completion_length": 1787.666748046875, + "epoch": 0.7964939024390244, + "grad_norm": 0.06123677090695507, + "kl": 0.0472412109375, + "learning_rate": 3.6281931534683396e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5225 + }, + { + "completion_length": 1600.666748046875, + "epoch": 0.7966463414634146, + "grad_norm": 0.06341697900085835, + "kl": 0.048095703125, + "learning_rate": 3.6229897884166467e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5226 + }, + { + "completion_length": 2614.166748046875, + "epoch": 0.7967987804878048, + "grad_norm": 0.09783079041240433, + "kl": 0.06201171875, + "learning_rate": 3.617789644694718e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5227 + }, + { + "completion_length": 2062.3333740234375, + "epoch": 0.7969512195121952, + "grad_norm": 0.07074169175206997, + "kl": 0.0556640625, + "learning_rate": 3.6125927237749416e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5228 + }, + { + "completion_length": 1593.0000915527344, + "epoch": 0.7971036585365854, + "grad_norm": 0.08115306718901864, + "kl": 0.050048828125, + "learning_rate": 3.607399027128795e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5229 + }, + { + "completion_length": 849.6666870117188, + "epoch": 0.7972560975609756, + "grad_norm": 2.519825281268705, + "kl": 0.077392578125, + "learning_rate": 3.602208556226843e-07, + "loss": 0.0031, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 5230 + }, + { + "completion_length": 2239.5000610351562, + "epoch": 0.7974085365853658, + "grad_norm": 0.0665212383099429, + "kl": 0.042724609375, + "learning_rate": 3.597021312538729e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5231 + }, + { + "completion_length": 1647.8333435058594, + "epoch": 0.7975609756097561, + "grad_norm": 0.1159176638417438, + "kl": 0.093994140625, + "learning_rate": 3.5918372975331933e-07, + "loss": 0.0038, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5232 + }, + { + "completion_length": 1935.3334350585938, + "epoch": 0.7977134146341464, + "grad_norm": 1.1584515788660856, + "kl": 0.0902099609375, + "learning_rate": 3.586656512678054e-07, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5233 + }, + { + "completion_length": 1504.6666870117188, + "epoch": 0.7978658536585366, + "grad_norm": 0.0842083843409199, + "kl": 0.0548095703125, + "learning_rate": 3.5814789594402256e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5234 + }, + { + "completion_length": 1978.666748046875, + "epoch": 0.7980182926829268, + "grad_norm": 0.07951977690152026, + "kl": 0.0404052734375, + "learning_rate": 3.57630463928569e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5235 + }, + { + "completion_length": 998.8333740234375, + "epoch": 0.7981707317073171, + "grad_norm": 0.11551934336027644, + "kl": 0.07275390625, + "learning_rate": 3.57113355367953e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5236 + }, + { + "completion_length": 1137.5, + "epoch": 0.7983231707317073, + "grad_norm": 2.55174842054572, + "kl": 0.079345703125, + "learning_rate": 3.565965704085905e-07, + "loss": 0.0032, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 5237 + }, + { + "completion_length": 2135.3333740234375, + "epoch": 0.7984756097560975, + "grad_norm": 0.0900597844823232, + "kl": 0.0408935546875, + "learning_rate": 3.560801091968059e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5238 + }, + { + "completion_length": 2703.166748046875, + "epoch": 0.7986280487804878, + "grad_norm": 1.2311641970537293, + "kl": 0.0531005859375, + "learning_rate": 3.5556397187883257e-07, + "loss": 0.0021, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5239 + }, + { + "completion_length": 1503.5000610351562, + "epoch": 0.7987804878048781, + "grad_norm": 0.08742906470810252, + "kl": 0.068359375, + "learning_rate": 3.5504815860081056e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5240 + }, + { + "completion_length": 843.1667175292969, + "epoch": 0.7989329268292683, + "grad_norm": 0.10546851132691708, + "kl": 0.06298828125, + "learning_rate": 3.545326695087893e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5241 + }, + { + "completion_length": 1645.0, + "epoch": 0.7990853658536585, + "grad_norm": 0.09672206719936347, + "kl": 0.0528564453125, + "learning_rate": 3.5401750474872703e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5242 + }, + { + "completion_length": 2224.166748046875, + "epoch": 0.7992378048780487, + "grad_norm": 0.059325838251117036, + "kl": 0.04345703125, + "learning_rate": 3.5350266446648964e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5243 + }, + { + "completion_length": 950.8333435058594, + "epoch": 0.7993902439024391, + "grad_norm": 0.20962525189862524, + "kl": 0.0589599609375, + "learning_rate": 3.5298814880785015e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5244 + }, + { + "completion_length": 1646.166748046875, + "epoch": 0.7995426829268293, + "grad_norm": 0.11035646378830531, + "kl": 0.0640869140625, + "learning_rate": 3.5247395791849075e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5245 + }, + { + "completion_length": 1269.0000610351562, + "epoch": 0.7996951219512195, + "grad_norm": 1.5775259816747575, + "kl": 0.089599609375, + "learning_rate": 3.519600919440014e-07, + "loss": 0.0036, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5246 + }, + { + "completion_length": 2136.8333740234375, + "epoch": 0.7998475609756097, + "grad_norm": 0.0949738748480484, + "kl": 0.0518798828125, + "learning_rate": 3.514465510298801e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5247 + }, + { + "completion_length": 2571.3333740234375, + "epoch": 0.8, + "grad_norm": 0.07482239159517434, + "kl": 0.056640625, + "learning_rate": 3.5093333532153313e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5248 + }, + { + "completion_length": 1756.1667175292969, + "epoch": 0.8001524390243903, + "grad_norm": 0.07340823756701581, + "kl": 0.0384521484375, + "learning_rate": 3.504204449642736e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5249 + }, + { + "completion_length": 1401.166748046875, + "epoch": 0.8003048780487805, + "grad_norm": 1.106325345219526, + "kl": 0.04248046875, + "learning_rate": 3.499078801033235e-07, + "loss": 0.0017, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5250 + }, + { + "completion_length": 2294.666717529297, + "epoch": 0.8004573170731707, + "grad_norm": 0.08225007798308187, + "kl": 0.044189453125, + "learning_rate": 3.4939564088381266e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5251 + }, + { + "completion_length": 1884.3333740234375, + "epoch": 0.800609756097561, + "grad_norm": 0.1024354111467981, + "kl": 0.0555419921875, + "learning_rate": 3.4888372745077845e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5252 + }, + { + "completion_length": 2160.5000610351562, + "epoch": 0.8007621951219512, + "grad_norm": 0.10363218347727984, + "kl": 0.0548095703125, + "learning_rate": 3.4837213994916545e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5253 + }, + { + "completion_length": 1884.666748046875, + "epoch": 0.8009146341463415, + "grad_norm": 0.07435981027712288, + "kl": 0.0533447265625, + "learning_rate": 3.4786087852382693e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5254 + }, + { + "completion_length": 3185.666748046875, + "epoch": 0.8010670731707317, + "grad_norm": 0.04659095445650954, + "kl": 0.050537109375, + "learning_rate": 3.4734994331952306e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5255 + }, + { + "completion_length": 2311.0000610351562, + "epoch": 0.801219512195122, + "grad_norm": 0.057335202570018146, + "kl": 0.0565185546875, + "learning_rate": 3.468393344809222e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5256 + }, + { + "completion_length": 1889.3334350585938, + "epoch": 0.8013719512195122, + "grad_norm": 0.08437425154488104, + "kl": 0.055419921875, + "learning_rate": 3.463290521526006e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5257 + }, + { + "completion_length": 2153.666748046875, + "epoch": 0.8015243902439024, + "grad_norm": 0.08256347046036695, + "kl": 0.051513671875, + "learning_rate": 3.458190964790405e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5258 + }, + { + "completion_length": 1930.5001220703125, + "epoch": 0.8016768292682926, + "grad_norm": 0.27821427894046724, + "kl": 0.058349609375, + "learning_rate": 3.4530946760463307e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5259 + }, + { + "completion_length": 2611.666748046875, + "epoch": 0.801829268292683, + "grad_norm": 0.04575499366616728, + "kl": 0.043701171875, + "learning_rate": 3.448001656736763e-07, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5260 + }, + { + "completion_length": 2486.8333740234375, + "epoch": 0.8019817073170732, + "grad_norm": 0.08324772952912295, + "kl": 0.0699462890625, + "learning_rate": 3.4429119083037715e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5261 + }, + { + "completion_length": 1735.6666870117188, + "epoch": 0.8021341463414634, + "grad_norm": 0.07106334759276604, + "kl": 0.0562744140625, + "learning_rate": 3.4378254321884715e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5262 + }, + { + "completion_length": 1864.5, + "epoch": 0.8022865853658536, + "grad_norm": 0.10881793821252507, + "kl": 0.0496826171875, + "learning_rate": 3.4327422298310735e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5263 + }, + { + "completion_length": 898.5, + "epoch": 0.802439024390244, + "grad_norm": 0.10362202643884465, + "kl": 0.0386962890625, + "learning_rate": 3.4276623026708556e-07, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5264 + }, + { + "completion_length": 865.5000305175781, + "epoch": 0.8025914634146342, + "grad_norm": 0.09424776021696157, + "kl": 0.05419921875, + "learning_rate": 3.4225856521461685e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5265 + }, + { + "completion_length": 1864.6666870117188, + "epoch": 0.8027439024390244, + "grad_norm": 0.08039199789003658, + "kl": 0.0640869140625, + "learning_rate": 3.41751227969443e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5266 + }, + { + "completion_length": 1826.5, + "epoch": 0.8028963414634146, + "grad_norm": 0.052024718288495114, + "kl": 0.04010009765625, + "learning_rate": 3.412442186752135e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5267 + }, + { + "completion_length": 1201.5, + "epoch": 0.8030487804878049, + "grad_norm": 0.07656280858650698, + "kl": 0.0479736328125, + "learning_rate": 3.4073753747548494e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5268 + }, + { + "completion_length": 2233.8333740234375, + "epoch": 0.8032012195121951, + "grad_norm": 0.08404305821477474, + "kl": 0.0548095703125, + "learning_rate": 3.4023118451372096e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5269 + }, + { + "completion_length": 1698.8333740234375, + "epoch": 0.8033536585365854, + "grad_norm": 0.1250607920873082, + "kl": 0.0521240234375, + "learning_rate": 3.397251599332928e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5270 + }, + { + "completion_length": 2695.5001220703125, + "epoch": 0.8035060975609756, + "grad_norm": 0.05130021641135232, + "kl": 0.0504150390625, + "learning_rate": 3.392194638774774e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5271 + }, + { + "completion_length": 926.1667175292969, + "epoch": 0.8036585365853659, + "grad_norm": 0.16503273998835363, + "kl": 0.060791015625, + "learning_rate": 3.3871409648945955e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5272 + }, + { + "completion_length": 1163.8333740234375, + "epoch": 0.8038109756097561, + "grad_norm": 0.20005338357167446, + "kl": 0.0615234375, + "learning_rate": 3.3820905791233106e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5273 + }, + { + "completion_length": 2577.0001220703125, + "epoch": 0.8039634146341463, + "grad_norm": 0.07451793072906207, + "kl": 0.05810546875, + "learning_rate": 3.377043482890906e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5274 + }, + { + "completion_length": 1834.3333740234375, + "epoch": 0.8041158536585366, + "grad_norm": 0.06175848050531483, + "kl": 0.0489501953125, + "learning_rate": 3.371999677626437e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5275 + }, + { + "completion_length": 1923.0, + "epoch": 0.8042682926829269, + "grad_norm": 1.9803887087887933, + "kl": 0.0623779296875, + "learning_rate": 3.3669591647580196e-07, + "loss": 0.0025, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 5276 + }, + { + "completion_length": 2024.0000610351562, + "epoch": 0.8044207317073171, + "grad_norm": 1.363415720297633, + "kl": 0.066162109375, + "learning_rate": 3.3619219457128477e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5277 + }, + { + "completion_length": 1539.3333740234375, + "epoch": 0.8045731707317073, + "grad_norm": 0.08048358121103835, + "kl": 0.0458984375, + "learning_rate": 3.3568880219171723e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5278 + }, + { + "completion_length": 1746.5, + "epoch": 0.8047256097560975, + "grad_norm": 1.451861010455908, + "kl": 0.0621337890625, + "learning_rate": 3.3518573947963333e-07, + "loss": 0.0025, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5279 + }, + { + "completion_length": 3213.3333740234375, + "epoch": 0.8048780487804879, + "grad_norm": 0.0481843478040406, + "kl": 0.0484619140625, + "learning_rate": 3.346830065774706e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5280 + }, + { + "completion_length": 1050.8333740234375, + "epoch": 0.8050304878048781, + "grad_norm": 0.13830434137957565, + "kl": 0.067626953125, + "learning_rate": 3.3418060362757545e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5281 + }, + { + "completion_length": 2840.3333740234375, + "epoch": 0.8051829268292683, + "grad_norm": 0.04683038039492095, + "kl": 0.0447998046875, + "learning_rate": 3.336785307722e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5282 + }, + { + "completion_length": 2525.8333740234375, + "epoch": 0.8053353658536585, + "grad_norm": 0.06585143824505489, + "kl": 0.0623779296875, + "learning_rate": 3.331767881535034e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5283 + }, + { + "completion_length": 1892.3333740234375, + "epoch": 0.8054878048780488, + "grad_norm": 0.23278236813854883, + "kl": 0.054443359375, + "learning_rate": 3.326753759135503e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5284 + }, + { + "completion_length": 1225.5, + "epoch": 0.805640243902439, + "grad_norm": 0.11214228783168396, + "kl": 0.064208984375, + "learning_rate": 3.3217429419431264e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5285 + }, + { + "completion_length": 1830.3333740234375, + "epoch": 0.8057926829268293, + "grad_norm": 0.1275033701729139, + "kl": 0.0484619140625, + "learning_rate": 3.3167354313766884e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5286 + }, + { + "completion_length": 564.8333435058594, + "epoch": 0.8059451219512195, + "grad_norm": 0.13274460235250635, + "kl": 0.03955078125, + "learning_rate": 3.311731228854034e-07, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5287 + }, + { + "completion_length": 1025.3333740234375, + "epoch": 0.8060975609756098, + "grad_norm": 0.1488942759095854, + "kl": 0.067626953125, + "learning_rate": 3.306730335792075e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5288 + }, + { + "completion_length": 1922.0, + "epoch": 0.80625, + "grad_norm": 0.12878182368755625, + "kl": 0.05615234375, + "learning_rate": 3.301732753606776e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5289 + }, + { + "completion_length": 962.5000305175781, + "epoch": 0.8064024390243902, + "grad_norm": 0.10232033658722953, + "kl": 0.05126953125, + "learning_rate": 3.2967384837131743e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5290 + }, + { + "completion_length": 2994.166748046875, + "epoch": 0.8065548780487805, + "grad_norm": 0.06127168890355777, + "kl": 0.0474853515625, + "learning_rate": 3.2917475275253674e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5291 + }, + { + "completion_length": 1660.5, + "epoch": 0.8067073170731708, + "grad_norm": 0.056108383726798584, + "kl": 0.0465087890625, + "learning_rate": 3.286759886456513e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5292 + }, + { + "completion_length": 1263.6666870117188, + "epoch": 0.806859756097561, + "grad_norm": 0.12642228302338365, + "kl": 0.0693359375, + "learning_rate": 3.2817755619188366e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5293 + }, + { + "completion_length": 787.1666870117188, + "epoch": 0.8070121951219512, + "grad_norm": 2.3196727834118036, + "kl": 0.072509765625, + "learning_rate": 3.276794555323608e-07, + "loss": 0.0029, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5294 + }, + { + "completion_length": 1461.5000305175781, + "epoch": 0.8071646341463414, + "grad_norm": 0.07187229144217885, + "kl": 0.041015625, + "learning_rate": 3.271816868081173e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5295 + }, + { + "completion_length": 1272.666748046875, + "epoch": 0.8073170731707318, + "grad_norm": 0.09534976751385103, + "kl": 0.0574951171875, + "learning_rate": 3.266842501600934e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5296 + }, + { + "completion_length": 1450.5000610351562, + "epoch": 0.807469512195122, + "grad_norm": 1.2578154780831328, + "kl": 0.0699462890625, + "learning_rate": 3.261871457291352e-07, + "loss": 0.0028, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.1666666716337204, + "step": 5297 + }, + { + "completion_length": 2261.8333740234375, + "epoch": 0.8076219512195122, + "grad_norm": 0.06820401764146612, + "kl": 0.0533447265625, + "learning_rate": 3.2569037365599454e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5298 + }, + { + "completion_length": 1421.0000610351562, + "epoch": 0.8077743902439024, + "grad_norm": 1.0419057602405704, + "kl": 0.0523681640625, + "learning_rate": 3.2519393408132945e-07, + "loss": 0.0021, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5299 + }, + { + "completion_length": 1598.5000610351562, + "epoch": 0.8079268292682927, + "grad_norm": 1.4404558222895858, + "kl": 0.072509765625, + "learning_rate": 3.2469782714570374e-07, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5300 + }, + { + "completion_length": 680.6666870117188, + "epoch": 0.808079268292683, + "grad_norm": 0.12057499515817623, + "kl": 0.0537109375, + "learning_rate": 3.2420205298958746e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5301 + }, + { + "completion_length": 1412.8333740234375, + "epoch": 0.8082317073170732, + "grad_norm": 0.10837422033623358, + "kl": 0.09228515625, + "learning_rate": 3.23706611753355e-07, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5302 + }, + { + "completion_length": 676.5000305175781, + "epoch": 0.8083841463414634, + "grad_norm": 2.1601837789992215, + "kl": 0.085205078125, + "learning_rate": 3.2321150357728794e-07, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5303 + }, + { + "completion_length": 1754.3333435058594, + "epoch": 0.8085365853658537, + "grad_norm": 1.7728093284178725, + "kl": 0.0560302734375, + "learning_rate": 3.2271672860157324e-07, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5304 + }, + { + "completion_length": 1030.5000305175781, + "epoch": 0.8086890243902439, + "grad_norm": 0.12607164210972377, + "kl": 0.0401611328125, + "learning_rate": 3.222222869663033e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5305 + }, + { + "completion_length": 1353.6666870117188, + "epoch": 0.8088414634146341, + "grad_norm": 0.21777847261890615, + "kl": 0.077880859375, + "learning_rate": 3.2172817881147643e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5306 + }, + { + "completion_length": 2280.5, + "epoch": 0.8089939024390244, + "grad_norm": 0.07022784556847268, + "kl": 0.0567626953125, + "learning_rate": 3.2123440427699556e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5307 + }, + { + "completion_length": 1057.3333740234375, + "epoch": 0.8091463414634147, + "grad_norm": 0.1291041817057472, + "kl": 0.0635986328125, + "learning_rate": 3.207409635026704e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5308 + }, + { + "completion_length": 1608.3333740234375, + "epoch": 0.8092987804878049, + "grad_norm": 0.10165890332025394, + "kl": 0.070556640625, + "learning_rate": 3.2024785662821546e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5309 + }, + { + "completion_length": 1830.3333740234375, + "epoch": 0.8094512195121951, + "grad_norm": 1.545232494290512, + "kl": 0.0665283203125, + "learning_rate": 3.197550837932511e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5310 + }, + { + "completion_length": 1207.8333740234375, + "epoch": 0.8096036585365853, + "grad_norm": 0.08972648504720462, + "kl": 0.0472412109375, + "learning_rate": 3.192626451373032e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5311 + }, + { + "completion_length": 1253.0, + "epoch": 0.8097560975609757, + "grad_norm": 0.1700149007875888, + "kl": 0.0384521484375, + "learning_rate": 3.187705407998018e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5312 + }, + { + "completion_length": 2336.1666870117188, + "epoch": 0.8099085365853659, + "grad_norm": 0.06959641426193075, + "kl": 0.0477294921875, + "learning_rate": 3.182787709200836e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5313 + }, + { + "completion_length": 1349.5000610351562, + "epoch": 0.8100609756097561, + "grad_norm": 0.22591874770644527, + "kl": 0.05126953125, + "learning_rate": 3.177873356373904e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5314 + }, + { + "completion_length": 1746.3333740234375, + "epoch": 0.8102134146341463, + "grad_norm": 0.09754421392895844, + "kl": 0.07080078125, + "learning_rate": 3.1729623509086876e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5315 + }, + { + "completion_length": 1615.6666870117188, + "epoch": 0.8103658536585366, + "grad_norm": 0.07672341793992007, + "kl": 0.04931640625, + "learning_rate": 3.16805469419571e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5316 + }, + { + "completion_length": 1682.6666870117188, + "epoch": 0.8105182926829269, + "grad_norm": 1.5577432811800926, + "kl": 0.0635986328125, + "learning_rate": 3.163150387624542e-07, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5317 + }, + { + "completion_length": 1579.6666870117188, + "epoch": 0.8106707317073171, + "grad_norm": 0.11749444698783595, + "kl": 0.0728759765625, + "learning_rate": 3.158249432583808e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5318 + }, + { + "completion_length": 2697.5001220703125, + "epoch": 0.8108231707317073, + "grad_norm": 0.10325480168748355, + "kl": 0.0704345703125, + "learning_rate": 3.1533518304611873e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5319 + }, + { + "completion_length": 870.8333740234375, + "epoch": 0.8109756097560976, + "grad_norm": 1.4007964405414963, + "kl": 0.092041015625, + "learning_rate": 3.148457582643398e-07, + "loss": 0.0037, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5320 + }, + { + "completion_length": 968.0000610351562, + "epoch": 0.8111280487804878, + "grad_norm": 0.10388972341557266, + "kl": 0.05712890625, + "learning_rate": 3.143566690516218e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5321 + }, + { + "completion_length": 1138.3333435058594, + "epoch": 0.811280487804878, + "grad_norm": 0.20307481735439895, + "kl": 0.0712890625, + "learning_rate": 3.1386791554644763e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5322 + }, + { + "completion_length": 884.1667175292969, + "epoch": 0.8114329268292683, + "grad_norm": 1.974576620567757, + "kl": 0.040283203125, + "learning_rate": 3.133794978872043e-07, + "loss": 0.0016, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5323 + }, + { + "completion_length": 1111.5000610351562, + "epoch": 0.8115853658536586, + "grad_norm": 0.20321479781514817, + "kl": 0.082763671875, + "learning_rate": 3.1289141621218513e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5324 + }, + { + "completion_length": 1429.166748046875, + "epoch": 0.8117378048780488, + "grad_norm": 1.5244104307244313, + "kl": 0.064453125, + "learning_rate": 3.124036706595865e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5325 + }, + { + "completion_length": 1810.3334350585938, + "epoch": 0.811890243902439, + "grad_norm": 0.07432647949070997, + "kl": 0.057373046875, + "learning_rate": 3.1191626136751086e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5326 + }, + { + "completion_length": 1563.8334350585938, + "epoch": 0.8120426829268292, + "grad_norm": 0.23266056772865162, + "kl": 0.0518798828125, + "learning_rate": 3.11429188473965e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5327 + }, + { + "completion_length": 929.1667175292969, + "epoch": 0.8121951219512196, + "grad_norm": 0.10276427986009705, + "kl": 0.0660400390625, + "learning_rate": 3.1094245211686106e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5328 + }, + { + "completion_length": 1425.3333740234375, + "epoch": 0.8123475609756098, + "grad_norm": 0.09757359678260404, + "kl": 0.0712890625, + "learning_rate": 3.1045605243401477e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5329 + }, + { + "completion_length": 998.8333435058594, + "epoch": 0.8125, + "grad_norm": 2.0291822845371366, + "kl": 0.082275390625, + "learning_rate": 3.0996998956314745e-07, + "loss": 0.0033, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5330 + }, + { + "completion_length": 920.3333435058594, + "epoch": 0.8126524390243902, + "grad_norm": 0.09545832951417352, + "kl": 0.041259765625, + "learning_rate": 3.094842636418848e-07, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5331 + }, + { + "completion_length": 856.0000305175781, + "epoch": 0.8128048780487804, + "grad_norm": 0.15275638641817207, + "kl": 0.0606689453125, + "learning_rate": 3.089988748077572e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5332 + }, + { + "completion_length": 1480.166748046875, + "epoch": 0.8129573170731708, + "grad_norm": 1.8822326017695852, + "kl": 0.07861328125, + "learning_rate": 3.085138231981996e-07, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5333 + }, + { + "completion_length": 676.1666870117188, + "epoch": 0.813109756097561, + "grad_norm": 2.215436316553822, + "kl": 0.0936279296875, + "learning_rate": 3.080291089505505e-07, + "loss": 0.0037, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5334 + }, + { + "completion_length": 1577.0000610351562, + "epoch": 0.8132621951219512, + "grad_norm": 0.09062338626947204, + "kl": 0.063232421875, + "learning_rate": 3.0754473220205476e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5335 + }, + { + "completion_length": 843.1666870117188, + "epoch": 0.8134146341463414, + "grad_norm": 0.16016821934005418, + "kl": 0.08251953125, + "learning_rate": 3.070606930898602e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5336 + }, + { + "completion_length": 1189.8333740234375, + "epoch": 0.8135670731707317, + "grad_norm": 0.295265700195552, + "kl": 0.091064453125, + "learning_rate": 3.0657699175101983e-07, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5337 + }, + { + "completion_length": 1148.0000457763672, + "epoch": 0.813719512195122, + "grad_norm": 2.6206774006167652, + "kl": 0.08544921875, + "learning_rate": 3.0609362832249015e-07, + "loss": 0.0034, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5338 + }, + { + "completion_length": 1279.6667175292969, + "epoch": 0.8138719512195122, + "grad_norm": 0.11759449186700205, + "kl": 0.060791015625, + "learning_rate": 3.0561060294113276e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5339 + }, + { + "completion_length": 1291.3333740234375, + "epoch": 0.8140243902439024, + "grad_norm": 0.08090989205097643, + "kl": 0.0673828125, + "learning_rate": 3.051279157437132e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5340 + }, + { + "completion_length": 1384.1667175292969, + "epoch": 0.8141768292682927, + "grad_norm": 2.3392574891699645, + "kl": 0.0635986328125, + "learning_rate": 3.0464556686690146e-07, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5341 + }, + { + "completion_length": 1652.166748046875, + "epoch": 0.8143292682926829, + "grad_norm": 0.8876129526793073, + "kl": 0.077880859375, + "learning_rate": 3.041635564472719e-07, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5342 + }, + { + "completion_length": 1178.3333740234375, + "epoch": 0.8144817073170731, + "grad_norm": 0.11061150044817179, + "kl": 0.0526123046875, + "learning_rate": 3.0368188462130225e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5343 + }, + { + "completion_length": 739.8333435058594, + "epoch": 0.8146341463414634, + "grad_norm": 0.11512770065714296, + "kl": 0.066162109375, + "learning_rate": 3.032005515253751e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5344 + }, + { + "completion_length": 1020.8333435058594, + "epoch": 0.8147865853658537, + "grad_norm": 0.1151285672475932, + "kl": 0.0657958984375, + "learning_rate": 3.0271955729577685e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5345 + }, + { + "completion_length": 791.1666870117188, + "epoch": 0.8149390243902439, + "grad_norm": 0.10721232969553914, + "kl": 0.04833984375, + "learning_rate": 3.022389020686986e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5346 + }, + { + "completion_length": 806.1666870117188, + "epoch": 0.8150914634146341, + "grad_norm": 2.316875148905597, + "kl": 0.079345703125, + "learning_rate": 3.0175858598023404e-07, + "loss": 0.0032, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5347 + }, + { + "completion_length": 694.5000305175781, + "epoch": 0.8152439024390243, + "grad_norm": 0.08770332129350068, + "kl": 0.0325927734375, + "learning_rate": 3.0127860916638204e-07, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5348 + }, + { + "completion_length": 931.8333435058594, + "epoch": 0.8153963414634147, + "grad_norm": 0.07925141619484254, + "kl": 0.037353515625, + "learning_rate": 3.007989717630451e-07, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5349 + }, + { + "completion_length": 1666.0000610351562, + "epoch": 0.8155487804878049, + "grad_norm": 2.0108551565444803, + "kl": 0.0743408203125, + "learning_rate": 3.0031967390602954e-07, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5350 + }, + { + "completion_length": 1183.1666870117188, + "epoch": 0.8157012195121951, + "grad_norm": 2.4828972819666144, + "kl": 0.07421875, + "learning_rate": 2.99840715731046e-07, + "loss": 0.003, + "reward": 0.6666666865348816, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.6666666865348816, + "rewards/format_reward": 0.0, + "step": 5351 + }, + { + "completion_length": 1378.0000610351562, + "epoch": 0.8158536585365853, + "grad_norm": 0.17886100380793848, + "kl": 0.073974609375, + "learning_rate": 2.9936209737370727e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5352 + }, + { + "completion_length": 815.3333435058594, + "epoch": 0.8160060975609756, + "grad_norm": 0.1375947202709643, + "kl": 0.0771484375, + "learning_rate": 2.9888381896953235e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5353 + }, + { + "completion_length": 1402.5000305175781, + "epoch": 0.8161585365853659, + "grad_norm": 0.09555080974604334, + "kl": 0.050537109375, + "learning_rate": 2.984058806539426e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5354 + }, + { + "completion_length": 1954.5, + "epoch": 0.8163109756097561, + "grad_norm": 0.06409662899159724, + "kl": 0.03814697265625, + "learning_rate": 2.9792828256226337e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5355 + }, + { + "completion_length": 2300.0000915527344, + "epoch": 0.8164634146341463, + "grad_norm": 0.07666832451660623, + "kl": 0.0582275390625, + "learning_rate": 2.97451024829723e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5356 + }, + { + "completion_length": 1308.3333740234375, + "epoch": 0.8166158536585366, + "grad_norm": 0.08140755763839103, + "kl": 0.0419921875, + "learning_rate": 2.969741075914544e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5357 + }, + { + "completion_length": 1855.5000915527344, + "epoch": 0.8167682926829268, + "grad_norm": 0.07930428181766437, + "kl": 0.0506591796875, + "learning_rate": 2.964975309824937e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5358 + }, + { + "completion_length": 2345.8333740234375, + "epoch": 0.816920731707317, + "grad_norm": 0.07666126473557518, + "kl": 0.066162109375, + "learning_rate": 2.9602129513778104e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5359 + }, + { + "completion_length": 875.1666870117188, + "epoch": 0.8170731707317073, + "grad_norm": 0.10853433476519582, + "kl": 0.045166015625, + "learning_rate": 2.955454001921588e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5360 + }, + { + "completion_length": 1621.1667175292969, + "epoch": 0.8172256097560976, + "grad_norm": 0.09954265537580416, + "kl": 0.0516357421875, + "learning_rate": 2.9506984628037424e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5361 + }, + { + "completion_length": 1261.3333740234375, + "epoch": 0.8173780487804878, + "grad_norm": 0.12508891984780915, + "kl": 0.053466796875, + "learning_rate": 2.9459463353707714e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5362 + }, + { + "completion_length": 794.1666870117188, + "epoch": 0.817530487804878, + "grad_norm": 0.13274261274088237, + "kl": 0.06884765625, + "learning_rate": 2.9411976209682156e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5363 + }, + { + "completion_length": 1324.3333740234375, + "epoch": 0.8176829268292682, + "grad_norm": 0.09192710755243913, + "kl": 0.0535888671875, + "learning_rate": 2.9364523209406423e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5364 + }, + { + "completion_length": 773.3333435058594, + "epoch": 0.8178353658536586, + "grad_norm": 0.13784617184967438, + "kl": 0.0753173828125, + "learning_rate": 2.931710436631649e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5365 + }, + { + "completion_length": 1005.1666870117188, + "epoch": 0.8179878048780488, + "grad_norm": 0.12571106504830643, + "kl": 0.0806884765625, + "learning_rate": 2.926971969383874e-07, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5366 + }, + { + "completion_length": 2605.1666870117188, + "epoch": 0.818140243902439, + "grad_norm": 0.07314025564127105, + "kl": 0.060302734375, + "learning_rate": 2.922236920538987e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5367 + }, + { + "completion_length": 1096.1666870117188, + "epoch": 0.8182926829268292, + "grad_norm": 0.09076571195689674, + "kl": 0.0628662109375, + "learning_rate": 2.917505291437683e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5368 + }, + { + "completion_length": 919.8333435058594, + "epoch": 0.8184451219512195, + "grad_norm": 0.0985532587699259, + "kl": 0.05517578125, + "learning_rate": 2.912777083419703e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5369 + }, + { + "completion_length": 994.6666870117188, + "epoch": 0.8185975609756098, + "grad_norm": 0.15375922045857696, + "kl": 0.0732421875, + "learning_rate": 2.908052297823793e-07, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5370 + }, + { + "completion_length": 683.6666870117188, + "epoch": 0.81875, + "grad_norm": 0.1050101806871384, + "kl": 0.0538330078125, + "learning_rate": 2.9033309359877604e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5371 + }, + { + "completion_length": 870.3333740234375, + "epoch": 0.8189024390243902, + "grad_norm": 0.16836566139487513, + "kl": 0.075927734375, + "learning_rate": 2.8986129992484254e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5372 + }, + { + "completion_length": 1977.3333740234375, + "epoch": 0.8190548780487805, + "grad_norm": 1.04162284705328, + "kl": 0.046875, + "learning_rate": 2.893898488941647e-07, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5373 + }, + { + "completion_length": 860.0000305175781, + "epoch": 0.8192073170731707, + "grad_norm": 0.10016612695649155, + "kl": 0.070068359375, + "learning_rate": 2.889187406402302e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5374 + }, + { + "completion_length": 1831.6666870117188, + "epoch": 0.819359756097561, + "grad_norm": 0.10739030958677696, + "kl": 0.0545654296875, + "learning_rate": 2.884479752964306e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5375 + }, + { + "completion_length": 1078.3333740234375, + "epoch": 0.8195121951219512, + "grad_norm": 0.09871101852795519, + "kl": 0.04620361328125, + "learning_rate": 2.879775529960603e-07, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5376 + }, + { + "completion_length": 1433.5, + "epoch": 0.8196646341463415, + "grad_norm": 2.4444375130660863, + "kl": 0.07568359375, + "learning_rate": 2.8750747387231704e-07, + "loss": 0.003, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5377 + }, + { + "completion_length": 1358.8333740234375, + "epoch": 0.8198170731707317, + "grad_norm": 0.104384202154282, + "kl": 0.051513671875, + "learning_rate": 2.870377380582997e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5378 + }, + { + "completion_length": 1148.3333740234375, + "epoch": 0.8199695121951219, + "grad_norm": 1.56690225463482, + "kl": 0.086181640625, + "learning_rate": 2.865683456870116e-07, + "loss": 0.0034, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5379 + }, + { + "completion_length": 1744.0, + "epoch": 0.8201219512195121, + "grad_norm": 0.07225170782868703, + "kl": 0.055908203125, + "learning_rate": 2.8609929689135833e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5380 + }, + { + "completion_length": 1549.6666870117188, + "epoch": 0.8202743902439025, + "grad_norm": 0.09116094439134942, + "kl": 0.0653076171875, + "learning_rate": 2.8563059180414806e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5381 + }, + { + "completion_length": 1258.0, + "epoch": 0.8204268292682927, + "grad_norm": 0.09279903128043453, + "kl": 0.077880859375, + "learning_rate": 2.8516223055809205e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5382 + }, + { + "completion_length": 1985.8333740234375, + "epoch": 0.8205792682926829, + "grad_norm": 0.09921052280781538, + "kl": 0.055419921875, + "learning_rate": 2.846942132858033e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5383 + }, + { + "completion_length": 1516.8333740234375, + "epoch": 0.8207317073170731, + "grad_norm": 0.08422595977084286, + "kl": 0.0604248046875, + "learning_rate": 2.842265401197982e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5384 + }, + { + "completion_length": 720.3333435058594, + "epoch": 0.8208841463414634, + "grad_norm": 0.16831561509658313, + "kl": 0.05615234375, + "learning_rate": 2.837592111924955e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5385 + }, + { + "completion_length": 1178.5, + "epoch": 0.8210365853658537, + "grad_norm": 0.08334416905135757, + "kl": 0.0537109375, + "learning_rate": 2.8329222663621663e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5386 + }, + { + "completion_length": 1450.5000610351562, + "epoch": 0.8211890243902439, + "grad_norm": 0.09182922012129677, + "kl": 0.078125, + "learning_rate": 2.828255865831857e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5387 + }, + { + "completion_length": 2780.3333740234375, + "epoch": 0.8213414634146341, + "grad_norm": 0.8864772547438712, + "kl": 0.051513671875, + "learning_rate": 2.82359291165528e-07, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5388 + }, + { + "completion_length": 1567.0, + "epoch": 0.8214939024390244, + "grad_norm": 0.08214661532082469, + "kl": 0.060302734375, + "learning_rate": 2.818933405152724e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5389 + }, + { + "completion_length": 1384.1666870117188, + "epoch": 0.8216463414634146, + "grad_norm": 0.12049165562822856, + "kl": 0.0567626953125, + "learning_rate": 2.81427734764351e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5390 + }, + { + "completion_length": 1022.6666870117188, + "epoch": 0.8217987804878049, + "grad_norm": 0.15019470255864345, + "kl": 0.0479736328125, + "learning_rate": 2.8096247404459594e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5391 + }, + { + "completion_length": 1183.3333740234375, + "epoch": 0.8219512195121951, + "grad_norm": 0.10372787125603442, + "kl": 0.0638427734375, + "learning_rate": 2.8049755848774337e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5392 + }, + { + "completion_length": 792.0000305175781, + "epoch": 0.8221036585365854, + "grad_norm": 0.12357266021922642, + "kl": 0.0592041015625, + "learning_rate": 2.8003298822543145e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5393 + }, + { + "completion_length": 2002.1666870117188, + "epoch": 0.8222560975609756, + "grad_norm": 1.9598166644580153, + "kl": 0.062744140625, + "learning_rate": 2.7956876338920007e-07, + "loss": 0.0025, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5394 + }, + { + "completion_length": 1176.5000305175781, + "epoch": 0.8224085365853658, + "grad_norm": 0.0888709590239594, + "kl": 0.0631103515625, + "learning_rate": 2.7910488411049194e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5395 + }, + { + "completion_length": 1377.8333740234375, + "epoch": 0.822560975609756, + "grad_norm": 0.30285979976255206, + "kl": 0.05615234375, + "learning_rate": 2.78641350520651e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5396 + }, + { + "completion_length": 795.3333435058594, + "epoch": 0.8227134146341464, + "grad_norm": 2.119327101493166, + "kl": 0.08935546875, + "learning_rate": 2.781781627509243e-07, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5397 + }, + { + "completion_length": 1273.6666870117188, + "epoch": 0.8228658536585366, + "grad_norm": 0.09419405947279921, + "kl": 0.06591796875, + "learning_rate": 2.777153209324606e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5398 + }, + { + "completion_length": 1460.3333740234375, + "epoch": 0.8230182926829268, + "grad_norm": 0.08539062096088605, + "kl": 0.0633544921875, + "learning_rate": 2.772528251963107e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5399 + }, + { + "completion_length": 1982.666748046875, + "epoch": 0.823170731707317, + "grad_norm": 0.0897057413483786, + "kl": 0.06787109375, + "learning_rate": 2.7679067567342766e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5400 + }, + { + "completion_length": 904.5000305175781, + "epoch": 0.8233231707317074, + "grad_norm": 0.10095742288918527, + "kl": 0.06591796875, + "learning_rate": 2.763288724946653e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5401 + }, + { + "completion_length": 1564.6667175292969, + "epoch": 0.8234756097560976, + "grad_norm": 0.11610221665415514, + "kl": 0.0721435546875, + "learning_rate": 2.758674157907811e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5402 + }, + { + "completion_length": 1013.1666870117188, + "epoch": 0.8236280487804878, + "grad_norm": 0.09369414567569041, + "kl": 0.0531005859375, + "learning_rate": 2.7540630569243355e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5403 + }, + { + "completion_length": 788.8333740234375, + "epoch": 0.823780487804878, + "grad_norm": 0.16982023852728687, + "kl": 0.064697265625, + "learning_rate": 2.749455423301829e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5404 + }, + { + "completion_length": 2154.5, + "epoch": 0.8239329268292683, + "grad_norm": 0.09837051069653055, + "kl": 0.06103515625, + "learning_rate": 2.74485125834492e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5405 + }, + { + "completion_length": 1545.666748046875, + "epoch": 0.8240853658536585, + "grad_norm": 0.06050967465089201, + "kl": 0.04931640625, + "learning_rate": 2.7402505633572425e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5406 + }, + { + "completion_length": 1017.8333740234375, + "epoch": 0.8242378048780488, + "grad_norm": 0.1775629402626026, + "kl": 0.06689453125, + "learning_rate": 2.7356533396414514e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5407 + }, + { + "completion_length": 988.0000305175781, + "epoch": 0.824390243902439, + "grad_norm": 0.09478250703686832, + "kl": 0.065673828125, + "learning_rate": 2.7310595884992354e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5408 + }, + { + "completion_length": 1532.1667175292969, + "epoch": 0.8245426829268293, + "grad_norm": 0.08168678508953743, + "kl": 0.0439453125, + "learning_rate": 2.7264693112312765e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5409 + }, + { + "completion_length": 1856.5, + "epoch": 0.8246951219512195, + "grad_norm": 0.24250498123683661, + "kl": 0.078125, + "learning_rate": 2.721882509137286e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5410 + }, + { + "completion_length": 2501.8333740234375, + "epoch": 0.8248475609756097, + "grad_norm": 0.0692083996628844, + "kl": 0.0548095703125, + "learning_rate": 2.717299183515987e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5411 + }, + { + "completion_length": 847.5000305175781, + "epoch": 0.825, + "grad_norm": 0.10833043707927498, + "kl": 0.056884765625, + "learning_rate": 2.7127193356651214e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5412 + }, + { + "completion_length": 2247.3333740234375, + "epoch": 0.8251524390243903, + "grad_norm": 0.8431956124310356, + "kl": 0.067138671875, + "learning_rate": 2.7081429668814486e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5413 + }, + { + "completion_length": 1180.3333740234375, + "epoch": 0.8253048780487805, + "grad_norm": 0.08961952395442714, + "kl": 0.052490234375, + "learning_rate": 2.70357007846073e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5414 + }, + { + "completion_length": 1340.1666870117188, + "epoch": 0.8254573170731707, + "grad_norm": 0.2072053860698736, + "kl": 0.0601806640625, + "learning_rate": 2.699000671697755e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5415 + }, + { + "completion_length": 1683.5000610351562, + "epoch": 0.8256097560975609, + "grad_norm": 1.5765286512774677, + "kl": 0.0513916015625, + "learning_rate": 2.6944347478863226e-07, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5416 + }, + { + "completion_length": 1878.1666870117188, + "epoch": 0.8257621951219513, + "grad_norm": 1.4238607721224843, + "kl": 0.059814453125, + "learning_rate": 2.689872308319246e-07, + "loss": 0.0024, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5417 + }, + { + "completion_length": 939.6667175292969, + "epoch": 0.8259146341463415, + "grad_norm": 0.08503224462848566, + "kl": 0.0447998046875, + "learning_rate": 2.685313354288355e-07, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5418 + }, + { + "completion_length": 1623.8333740234375, + "epoch": 0.8260670731707317, + "grad_norm": 0.13501984300838785, + "kl": 0.073974609375, + "learning_rate": 2.6807578870844816e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5419 + }, + { + "completion_length": 1083.3333740234375, + "epoch": 0.8262195121951219, + "grad_norm": 0.15416802461560417, + "kl": 0.06591796875, + "learning_rate": 2.676205907997484e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5420 + }, + { + "completion_length": 2000.666748046875, + "epoch": 0.8263719512195122, + "grad_norm": 0.10873811453642147, + "kl": 0.064697265625, + "learning_rate": 2.671657418316223e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5421 + }, + { + "completion_length": 1055.1667175292969, + "epoch": 0.8265243902439025, + "grad_norm": 0.09734653755332022, + "kl": 0.0538330078125, + "learning_rate": 2.6671124193285823e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5422 + }, + { + "completion_length": 1201.6666870117188, + "epoch": 0.8266768292682927, + "grad_norm": 0.10954219714795808, + "kl": 0.07763671875, + "learning_rate": 2.66257091232144e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5423 + }, + { + "completion_length": 2259.666748046875, + "epoch": 0.8268292682926829, + "grad_norm": 0.16108489408781265, + "kl": 0.0614013671875, + "learning_rate": 2.658032898580702e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5424 + }, + { + "completion_length": 1079.8333740234375, + "epoch": 0.8269817073170732, + "grad_norm": 0.16283666993337337, + "kl": 0.07177734375, + "learning_rate": 2.653498379391275e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5425 + }, + { + "completion_length": 1669.166748046875, + "epoch": 0.8271341463414634, + "grad_norm": 0.11327216708454156, + "kl": 0.0601806640625, + "learning_rate": 2.6489673560370834e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5426 + }, + { + "completion_length": 596.6666717529297, + "epoch": 0.8272865853658536, + "grad_norm": 0.15455618136168153, + "kl": 0.0665283203125, + "learning_rate": 2.644439829801057e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5427 + }, + { + "completion_length": 1334.5, + "epoch": 0.8274390243902439, + "grad_norm": 0.09950726633660702, + "kl": 0.0631103515625, + "learning_rate": 2.6399158019651364e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5428 + }, + { + "completion_length": 1414.0000610351562, + "epoch": 0.8275914634146342, + "grad_norm": 0.08424111018321137, + "kl": 0.0615234375, + "learning_rate": 2.6353952738102726e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5429 + }, + { + "completion_length": 1230.166748046875, + "epoch": 0.8277439024390244, + "grad_norm": 0.13308045886533104, + "kl": 0.0562744140625, + "learning_rate": 2.630878246616425e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5430 + }, + { + "completion_length": 1427.8333740234375, + "epoch": 0.8278963414634146, + "grad_norm": 0.08357465044953469, + "kl": 0.0606689453125, + "learning_rate": 2.626364721662567e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5431 + }, + { + "completion_length": 1540.8333740234375, + "epoch": 0.8280487804878048, + "grad_norm": 1.0847723168953762, + "kl": 0.0733642578125, + "learning_rate": 2.621854700226663e-07, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5432 + }, + { + "completion_length": 857.8333435058594, + "epoch": 0.8282012195121952, + "grad_norm": 0.11453616624458128, + "kl": 0.07421875, + "learning_rate": 2.6173481835857065e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5433 + }, + { + "completion_length": 1306.0, + "epoch": 0.8283536585365854, + "grad_norm": 0.2703604883758105, + "kl": 0.090576171875, + "learning_rate": 2.612845173015687e-07, + "loss": 0.0036, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5434 + }, + { + "completion_length": 2567.166748046875, + "epoch": 0.8285060975609756, + "grad_norm": 0.056287208732041884, + "kl": 0.0487060546875, + "learning_rate": 2.608345669791605e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5435 + }, + { + "completion_length": 1539.0, + "epoch": 0.8286585365853658, + "grad_norm": 1.8452778276879005, + "kl": 0.0574951171875, + "learning_rate": 2.603849675187469e-07, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5436 + }, + { + "completion_length": 1846.1667175292969, + "epoch": 0.8288109756097561, + "grad_norm": 0.0775540197411065, + "kl": 0.05078125, + "learning_rate": 2.599357190476285e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5437 + }, + { + "completion_length": 1203.5, + "epoch": 0.8289634146341464, + "grad_norm": 0.0735278606500387, + "kl": 0.04779052734375, + "learning_rate": 2.5948682169300775e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5438 + }, + { + "completion_length": 1573.8333435058594, + "epoch": 0.8291158536585366, + "grad_norm": 0.109241257482218, + "kl": 0.068603515625, + "learning_rate": 2.590382755819872e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5439 + }, + { + "completion_length": 1628.1666870117188, + "epoch": 0.8292682926829268, + "grad_norm": 0.10509766963960362, + "kl": 0.0570068359375, + "learning_rate": 2.5859008084156986e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5440 + }, + { + "completion_length": 1872.1666870117188, + "epoch": 0.8294207317073171, + "grad_norm": 0.06439862484075073, + "kl": 0.057373046875, + "learning_rate": 2.581422375986588e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5441 + }, + { + "completion_length": 1191.0000305175781, + "epoch": 0.8295731707317073, + "grad_norm": 0.10539786365235115, + "kl": 0.057861328125, + "learning_rate": 2.5769474598005866e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5442 + }, + { + "completion_length": 2244.0001220703125, + "epoch": 0.8297256097560975, + "grad_norm": 0.07177248226934489, + "kl": 0.064697265625, + "learning_rate": 2.5724760611247343e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5443 + }, + { + "completion_length": 1341.1666870117188, + "epoch": 0.8298780487804878, + "grad_norm": 0.09847721487233953, + "kl": 0.0555419921875, + "learning_rate": 2.5680081812250825e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5444 + }, + { + "completion_length": 1857.1666870117188, + "epoch": 0.8300304878048781, + "grad_norm": 0.12056878064969775, + "kl": 0.0665283203125, + "learning_rate": 2.5635438213666845e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5445 + }, + { + "completion_length": 2420.0, + "epoch": 0.8301829268292683, + "grad_norm": 0.06783386947282048, + "kl": 0.04833984375, + "learning_rate": 2.5590829828135956e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5446 + }, + { + "completion_length": 1415.8333740234375, + "epoch": 0.8303353658536585, + "grad_norm": 1.3213959223713116, + "kl": 0.07470703125, + "learning_rate": 2.5546256668288715e-07, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5447 + }, + { + "completion_length": 2238.0000610351562, + "epoch": 0.8304878048780487, + "grad_norm": 0.09378983903641971, + "kl": 0.047607421875, + "learning_rate": 2.5501718746745766e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5448 + }, + { + "completion_length": 1933.5000610351562, + "epoch": 0.8306402439024391, + "grad_norm": 0.08625821001270176, + "kl": 0.0721435546875, + "learning_rate": 2.545721607611779e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5449 + }, + { + "completion_length": 1363.3333740234375, + "epoch": 0.8307926829268293, + "grad_norm": 0.07190457598470548, + "kl": 0.0604248046875, + "learning_rate": 2.541274866900534e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5450 + }, + { + "completion_length": 1095.0, + "epoch": 0.8309451219512195, + "grad_norm": 0.08134997359447177, + "kl": 0.05224609375, + "learning_rate": 2.5368316537999134e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5451 + }, + { + "completion_length": 523.3333435058594, + "epoch": 0.8310975609756097, + "grad_norm": 0.18978177399451934, + "kl": 0.08837890625, + "learning_rate": 2.532391969567986e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5452 + }, + { + "completion_length": 988.8333740234375, + "epoch": 0.83125, + "grad_norm": 0.09048057388050876, + "kl": 0.068115234375, + "learning_rate": 2.52795581546182e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5453 + }, + { + "completion_length": 1347.666748046875, + "epoch": 0.8314024390243903, + "grad_norm": 0.14727490942970448, + "kl": 0.077392578125, + "learning_rate": 2.5235231927374893e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5454 + }, + { + "completion_length": 2065.166748046875, + "epoch": 0.8315548780487805, + "grad_norm": 0.14444278200204308, + "kl": 0.06103515625, + "learning_rate": 2.5190941026500565e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5455 + }, + { + "completion_length": 1473.8333435058594, + "epoch": 0.8317073170731707, + "grad_norm": 0.09213623144988949, + "kl": 0.054931640625, + "learning_rate": 2.514668546453592e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5456 + }, + { + "completion_length": 995.0000305175781, + "epoch": 0.831859756097561, + "grad_norm": 1.70877080918857, + "kl": 0.11962890625, + "learning_rate": 2.5102465254011683e-07, + "loss": 0.0048, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5457 + }, + { + "completion_length": 1015.8333740234375, + "epoch": 0.8320121951219512, + "grad_norm": 0.18498447491124279, + "kl": 0.064453125, + "learning_rate": 2.5058280407448534e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5458 + }, + { + "completion_length": 2101.3333740234375, + "epoch": 0.8321646341463415, + "grad_norm": 0.09342022579694058, + "kl": 0.056640625, + "learning_rate": 2.5014130937357094e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5459 + }, + { + "completion_length": 1152.3333740234375, + "epoch": 0.8323170731707317, + "grad_norm": 0.17557603698965096, + "kl": 0.0611572265625, + "learning_rate": 2.497001685623802e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5460 + }, + { + "completion_length": 2107.3334350585938, + "epoch": 0.832469512195122, + "grad_norm": 0.10549020142579824, + "kl": 0.05224609375, + "learning_rate": 2.4925938176581956e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5461 + }, + { + "completion_length": 934.0000305175781, + "epoch": 0.8326219512195122, + "grad_norm": 0.13198545342572945, + "kl": 0.072265625, + "learning_rate": 2.488189491086952e-07, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5462 + }, + { + "completion_length": 1070.8333740234375, + "epoch": 0.8327743902439024, + "grad_norm": 0.165913176740101, + "kl": 0.0655517578125, + "learning_rate": 2.483788707157126e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5463 + }, + { + "completion_length": 2667.5, + "epoch": 0.8329268292682926, + "grad_norm": 1.1782514104378654, + "kl": 0.0606689453125, + "learning_rate": 2.4793914671147745e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5464 + }, + { + "completion_length": 1545.8333740234375, + "epoch": 0.833079268292683, + "grad_norm": 1.3688852912787606, + "kl": 0.062255859375, + "learning_rate": 2.474997772204947e-07, + "loss": 0.0025, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5465 + }, + { + "completion_length": 1696.166748046875, + "epoch": 0.8332317073170732, + "grad_norm": 0.0930741005869414, + "kl": 0.054931640625, + "learning_rate": 2.4706076236716925e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5466 + }, + { + "completion_length": 2031.3333740234375, + "epoch": 0.8333841463414634, + "grad_norm": 0.05807944602515757, + "kl": 0.05322265625, + "learning_rate": 2.4662210227580554e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5467 + }, + { + "completion_length": 1333.8333740234375, + "epoch": 0.8335365853658536, + "grad_norm": 0.15321814369661446, + "kl": 0.048828125, + "learning_rate": 2.4618379707060703e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5468 + }, + { + "completion_length": 1425.1666870117188, + "epoch": 0.833689024390244, + "grad_norm": 0.15087237506827444, + "kl": 0.085693359375, + "learning_rate": 2.457458468756773e-07, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5469 + }, + { + "completion_length": 760.0, + "epoch": 0.8338414634146342, + "grad_norm": 0.09312402441786793, + "kl": 0.0406494140625, + "learning_rate": 2.4530825181501926e-07, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5470 + }, + { + "completion_length": 1752.8333435058594, + "epoch": 0.8339939024390244, + "grad_norm": 0.10324847864096053, + "kl": 0.0582275390625, + "learning_rate": 2.4487101201253573e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5471 + }, + { + "completion_length": 2034.3333740234375, + "epoch": 0.8341463414634146, + "grad_norm": 0.06539809977710187, + "kl": 0.0537109375, + "learning_rate": 2.4443412759202745e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5472 + }, + { + "completion_length": 1208.0, + "epoch": 0.8342987804878049, + "grad_norm": 0.1046422366324183, + "kl": 0.0615234375, + "learning_rate": 2.43997598677196e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5473 + }, + { + "completion_length": 2303.8333740234375, + "epoch": 0.8344512195121951, + "grad_norm": 0.07996851486162888, + "kl": 0.0574951171875, + "learning_rate": 2.435614253916421e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5474 + }, + { + "completion_length": 1097.6667175292969, + "epoch": 0.8346036585365854, + "grad_norm": 1.3897527768838798, + "kl": 0.061279296875, + "learning_rate": 2.4312560785886513e-07, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5475 + }, + { + "completion_length": 801.5000305175781, + "epoch": 0.8347560975609756, + "grad_norm": 1.7866019099134278, + "kl": 0.0823974609375, + "learning_rate": 2.426901462022645e-07, + "loss": 0.0033, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5476 + }, + { + "completion_length": 1255.3333740234375, + "epoch": 0.8349085365853659, + "grad_norm": 0.17913491245755336, + "kl": 0.0596923828125, + "learning_rate": 2.422550405451381e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5477 + }, + { + "completion_length": 2658.166748046875, + "epoch": 0.8350609756097561, + "grad_norm": 0.05860682839916675, + "kl": 0.0606689453125, + "learning_rate": 2.418202910106834e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5478 + }, + { + "completion_length": 1690.5000610351562, + "epoch": 0.8352134146341463, + "grad_norm": 0.10175885439727017, + "kl": 0.0538330078125, + "learning_rate": 2.413858977219972e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5479 + }, + { + "completion_length": 1062.5, + "epoch": 0.8353658536585366, + "grad_norm": 0.1372938552698174, + "kl": 0.075927734375, + "learning_rate": 2.4095186080207505e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5480 + }, + { + "completion_length": 1792.5, + "epoch": 0.8355182926829269, + "grad_norm": 0.08967014756639662, + "kl": 0.0753173828125, + "learning_rate": 2.405181803738122e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5481 + }, + { + "completion_length": 2131.8333740234375, + "epoch": 0.8356707317073171, + "grad_norm": 1.0355725414698975, + "kl": 0.0654296875, + "learning_rate": 2.4008485656000223e-07, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5482 + }, + { + "completion_length": 742.6666717529297, + "epoch": 0.8358231707317073, + "grad_norm": 0.10666855738301763, + "kl": 0.068115234375, + "learning_rate": 2.396518894833382e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5483 + }, + { + "completion_length": 1834.666748046875, + "epoch": 0.8359756097560975, + "grad_norm": 0.0903286760975359, + "kl": 0.0535888671875, + "learning_rate": 2.392192792664121e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5484 + }, + { + "completion_length": 1582.0, + "epoch": 0.8361280487804879, + "grad_norm": 1.5889657601629041, + "kl": 0.0631103515625, + "learning_rate": 2.3878702603171513e-07, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5485 + }, + { + "completion_length": 1003.5000610351562, + "epoch": 0.8362804878048781, + "grad_norm": 0.1121258663607277, + "kl": 0.048095703125, + "learning_rate": 2.3835512990163638e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5486 + }, + { + "completion_length": 1664.0000610351562, + "epoch": 0.8364329268292683, + "grad_norm": 0.08582292259226383, + "kl": 0.0518798828125, + "learning_rate": 2.37923590998465e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5487 + }, + { + "completion_length": 2807.1666870117188, + "epoch": 0.8365853658536585, + "grad_norm": 0.0915567484377665, + "kl": 0.0621337890625, + "learning_rate": 2.3749240944438845e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5488 + }, + { + "completion_length": 2216.5000610351562, + "epoch": 0.8367378048780488, + "grad_norm": 0.059077765009927084, + "kl": 0.056640625, + "learning_rate": 2.3706158536149337e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5489 + }, + { + "completion_length": 1994.8333740234375, + "epoch": 0.836890243902439, + "grad_norm": 0.1940727952962175, + "kl": 0.06298828125, + "learning_rate": 2.3663111887176435e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5490 + }, + { + "completion_length": 2198.666748046875, + "epoch": 0.8370426829268293, + "grad_norm": 2.337772863224339, + "kl": 0.07080078125, + "learning_rate": 2.3620101009708577e-07, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5491 + }, + { + "completion_length": 827.6666870117188, + "epoch": 0.8371951219512195, + "grad_norm": 0.12906719306603856, + "kl": 0.0538330078125, + "learning_rate": 2.3577125915924004e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5492 + }, + { + "completion_length": 1077.1666870117188, + "epoch": 0.8373475609756098, + "grad_norm": 0.08602978261476495, + "kl": 0.0653076171875, + "learning_rate": 2.3534186617990861e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5493 + }, + { + "completion_length": 1188.3333740234375, + "epoch": 0.8375, + "grad_norm": 1.3973164360740589, + "kl": 0.06494140625, + "learning_rate": 2.3491283128067176e-07, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5494 + }, + { + "completion_length": 1428.666748046875, + "epoch": 0.8376524390243902, + "grad_norm": 0.14577202665650554, + "kl": 0.04736328125, + "learning_rate": 2.3448415458300732e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5495 + }, + { + "completion_length": 853.3333435058594, + "epoch": 0.8378048780487805, + "grad_norm": 3.4078470731827726, + "kl": 0.0653076171875, + "learning_rate": 2.3405583620829268e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5496 + }, + { + "completion_length": 1838.5, + "epoch": 0.8379573170731708, + "grad_norm": 0.06548711989665804, + "kl": 0.052490234375, + "learning_rate": 2.3362787627780368e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5497 + }, + { + "completion_length": 2401.666748046875, + "epoch": 0.838109756097561, + "grad_norm": 0.06328584705054788, + "kl": 0.04296875, + "learning_rate": 2.3320027491271457e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5498 + }, + { + "completion_length": 2612.166748046875, + "epoch": 0.8382621951219512, + "grad_norm": 1.637486580754887, + "kl": 0.0513916015625, + "learning_rate": 2.3277303223409812e-07, + "loss": 0.0021, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5499 + }, + { + "completion_length": 1923.5000610351562, + "epoch": 0.8384146341463414, + "grad_norm": 0.11981892247638065, + "kl": 0.0628662109375, + "learning_rate": 2.3234614836292462e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5500 + }, + { + "completion_length": 1159.166748046875, + "epoch": 0.8385670731707318, + "grad_norm": 0.0876632262210615, + "kl": 0.0599365234375, + "learning_rate": 2.3191962342006467e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5501 + }, + { + "completion_length": 1491.3333740234375, + "epoch": 0.838719512195122, + "grad_norm": 0.1402588866511298, + "kl": 0.083984375, + "learning_rate": 2.3149345752628588e-07, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5502 + }, + { + "completion_length": 2083.666748046875, + "epoch": 0.8388719512195122, + "grad_norm": 0.09596453462887418, + "kl": 0.0501708984375, + "learning_rate": 2.31067650802254e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5503 + }, + { + "completion_length": 746.5, + "epoch": 0.8390243902439024, + "grad_norm": 0.11816494528799286, + "kl": 0.0633544921875, + "learning_rate": 2.3064220336853398e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5504 + }, + { + "completion_length": 1201.0, + "epoch": 0.8391768292682927, + "grad_norm": 1.6151366467776151, + "kl": 0.0618896484375, + "learning_rate": 2.3021711534558843e-07, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5505 + }, + { + "completion_length": 1757.3334350585938, + "epoch": 0.839329268292683, + "grad_norm": 0.07560096737750897, + "kl": 0.052001953125, + "learning_rate": 2.297923868537784e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5506 + }, + { + "completion_length": 1333.0, + "epoch": 0.8394817073170732, + "grad_norm": 0.09064983951014989, + "kl": 0.0635986328125, + "learning_rate": 2.2936801801336354e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5507 + }, + { + "completion_length": 673.6666870117188, + "epoch": 0.8396341463414634, + "grad_norm": 0.10083304212462188, + "kl": 0.0709228515625, + "learning_rate": 2.289440089445004e-07, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5508 + }, + { + "completion_length": 2266.5, + "epoch": 0.8397865853658537, + "grad_norm": 0.060855526741928054, + "kl": 0.0472412109375, + "learning_rate": 2.285203597672451e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5509 + }, + { + "completion_length": 1469.5000305175781, + "epoch": 0.8399390243902439, + "grad_norm": 0.08720519765663184, + "kl": 0.062744140625, + "learning_rate": 2.2809707060155123e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5510 + }, + { + "completion_length": 760.3333435058594, + "epoch": 0.8400914634146341, + "grad_norm": 0.12388830268411605, + "kl": 0.087646484375, + "learning_rate": 2.2767414156727034e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5511 + }, + { + "completion_length": 1666.6666870117188, + "epoch": 0.8402439024390244, + "grad_norm": 0.1547831719671116, + "kl": 0.0557861328125, + "learning_rate": 2.272515727841527e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5512 + }, + { + "completion_length": 1876.166748046875, + "epoch": 0.8403963414634147, + "grad_norm": 0.09203094054522432, + "kl": 0.06005859375, + "learning_rate": 2.268293643718451e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5513 + }, + { + "completion_length": 1337.3333740234375, + "epoch": 0.8405487804878049, + "grad_norm": 1.5276481509024815, + "kl": 0.082763671875, + "learning_rate": 2.264075164498937e-07, + "loss": 0.0033, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5514 + }, + { + "completion_length": 861.6666870117188, + "epoch": 0.8407012195121951, + "grad_norm": 0.0821096502961863, + "kl": 0.054931640625, + "learning_rate": 2.259860291377422e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5515 + }, + { + "completion_length": 1456.0000610351562, + "epoch": 0.8408536585365853, + "grad_norm": 0.11619622987924647, + "kl": 0.0660400390625, + "learning_rate": 2.2556490255473205e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5516 + }, + { + "completion_length": 1364.3333740234375, + "epoch": 0.8410060975609757, + "grad_norm": 0.14298140206511029, + "kl": 0.0511474609375, + "learning_rate": 2.2514413682010292e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5517 + }, + { + "completion_length": 585.6666870117188, + "epoch": 0.8411585365853659, + "grad_norm": 0.22197171721333014, + "kl": 0.082275390625, + "learning_rate": 2.2472373205299102e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5518 + }, + { + "completion_length": 2422.8333740234375, + "epoch": 0.8413109756097561, + "grad_norm": 0.06381450175939753, + "kl": 0.050048828125, + "learning_rate": 2.2430368837243242e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5519 + }, + { + "completion_length": 1624.3334350585938, + "epoch": 0.8414634146341463, + "grad_norm": 0.15561849845734715, + "kl": 0.0751953125, + "learning_rate": 2.2388400589735985e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5520 + }, + { + "completion_length": 897.6666717529297, + "epoch": 0.8416158536585366, + "grad_norm": 0.11286733498127707, + "kl": 0.052734375, + "learning_rate": 2.234646847466031e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5521 + }, + { + "completion_length": 1573.0, + "epoch": 0.8417682926829269, + "grad_norm": 0.11347953950310379, + "kl": 0.05419921875, + "learning_rate": 2.2304572503889065e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5522 + }, + { + "completion_length": 1116.1667175292969, + "epoch": 0.8419207317073171, + "grad_norm": 0.07640730685865843, + "kl": 0.048828125, + "learning_rate": 2.226271268928483e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5523 + }, + { + "completion_length": 1980.6667175292969, + "epoch": 0.8420731707317073, + "grad_norm": 0.19340641421110538, + "kl": 0.0535888671875, + "learning_rate": 2.2220889042699976e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5524 + }, + { + "completion_length": 1195.8333740234375, + "epoch": 0.8422256097560976, + "grad_norm": 0.09580958924765218, + "kl": 0.0438232421875, + "learning_rate": 2.217910157597661e-07, + "loss": 0.0018, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 5525 + }, + { + "completion_length": 1309.5000305175781, + "epoch": 0.8423780487804878, + "grad_norm": 0.09092055715481394, + "kl": 0.061767578125, + "learning_rate": 2.2137350300946557e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5526 + }, + { + "completion_length": 1935.0000915527344, + "epoch": 0.842530487804878, + "grad_norm": 1.6934002634778822, + "kl": 0.065673828125, + "learning_rate": 2.2095635229431437e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5527 + }, + { + "completion_length": 973.5, + "epoch": 0.8426829268292683, + "grad_norm": 0.1001020619982202, + "kl": 0.0615234375, + "learning_rate": 2.205395637324264e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5528 + }, + { + "completion_length": 1505.1667175292969, + "epoch": 0.8428353658536586, + "grad_norm": 0.15591661100693868, + "kl": 0.072265625, + "learning_rate": 2.2012313744181256e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5529 + }, + { + "completion_length": 1413.0000610351562, + "epoch": 0.8429878048780488, + "grad_norm": 1.8173721061620327, + "kl": 0.054443359375, + "learning_rate": 2.197070735403817e-07, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5530 + }, + { + "completion_length": 2386.6666870117188, + "epoch": 0.843140243902439, + "grad_norm": 0.08067717945285832, + "kl": 0.051513671875, + "learning_rate": 2.1929137214593915e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5531 + }, + { + "completion_length": 1744.0, + "epoch": 0.8432926829268292, + "grad_norm": 1.0796328533658173, + "kl": 0.0587158203125, + "learning_rate": 2.188760333761885e-07, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5532 + }, + { + "completion_length": 866.3333740234375, + "epoch": 0.8434451219512196, + "grad_norm": 1.9518465738835449, + "kl": 0.0712890625, + "learning_rate": 2.1846105734873022e-07, + "loss": 0.0028, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5533 + }, + { + "completion_length": 2753.3334350585938, + "epoch": 0.8435975609756098, + "grad_norm": 0.06090333751325149, + "kl": 0.0521240234375, + "learning_rate": 2.1804644418106268e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5534 + }, + { + "completion_length": 1552.0000610351562, + "epoch": 0.84375, + "grad_norm": 0.10191173390126167, + "kl": 0.0693359375, + "learning_rate": 2.1763219399058043e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5535 + }, + { + "completion_length": 932.3333740234375, + "epoch": 0.8439024390243902, + "grad_norm": 0.11523776112912675, + "kl": 0.08203125, + "learning_rate": 2.1721830689457583e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5536 + }, + { + "completion_length": 1907.8333740234375, + "epoch": 0.8440548780487804, + "grad_norm": 0.08156058960399004, + "kl": 0.05908203125, + "learning_rate": 2.1680478301023848e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5537 + }, + { + "completion_length": 1778.0001220703125, + "epoch": 0.8442073170731708, + "grad_norm": 0.13644722329738027, + "kl": 0.069580078125, + "learning_rate": 2.1639162245465587e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5538 + }, + { + "completion_length": 1552.0, + "epoch": 0.844359756097561, + "grad_norm": 0.0965500810395119, + "kl": 0.062744140625, + "learning_rate": 2.1597882534481094e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5539 + }, + { + "completion_length": 1680.3333740234375, + "epoch": 0.8445121951219512, + "grad_norm": 0.1345700656778597, + "kl": 0.0628662109375, + "learning_rate": 2.1556639179758502e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5540 + }, + { + "completion_length": 1273.3333435058594, + "epoch": 0.8446646341463414, + "grad_norm": 0.09327059526331667, + "kl": 0.0421142578125, + "learning_rate": 2.1515432192975587e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5541 + }, + { + "completion_length": 1305.1666870117188, + "epoch": 0.8448170731707317, + "grad_norm": 0.11033165343846449, + "kl": 0.055908203125, + "learning_rate": 2.147426158579988e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5542 + }, + { + "completion_length": 1728.666748046875, + "epoch": 0.844969512195122, + "grad_norm": 0.1876705806472103, + "kl": 0.06787109375, + "learning_rate": 2.1433127369888593e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5543 + }, + { + "completion_length": 1215.6666870117188, + "epoch": 0.8451219512195122, + "grad_norm": 0.11360694748450709, + "kl": 0.0732421875, + "learning_rate": 2.1392029556888576e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5544 + }, + { + "completion_length": 2428.166748046875, + "epoch": 0.8452743902439024, + "grad_norm": 1.8249336033633579, + "kl": 0.050048828125, + "learning_rate": 2.1350968158436434e-07, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5545 + }, + { + "completion_length": 2067.5, + "epoch": 0.8454268292682927, + "grad_norm": 0.10019966076003822, + "kl": 0.04736328125, + "learning_rate": 2.130994318615847e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5546 + }, + { + "completion_length": 939.6667175292969, + "epoch": 0.8455792682926829, + "grad_norm": 2.0005969041982254, + "kl": 0.093017578125, + "learning_rate": 2.1268954651670625e-07, + "loss": 0.0037, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5547 + }, + { + "completion_length": 1051.3333435058594, + "epoch": 0.8457317073170731, + "grad_norm": 0.1522518751320278, + "kl": 0.0538330078125, + "learning_rate": 2.1228002566578598e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5548 + }, + { + "completion_length": 2569.3334350585938, + "epoch": 0.8458841463414634, + "grad_norm": 0.0883729596928537, + "kl": 0.060791015625, + "learning_rate": 2.1187086942477628e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5549 + }, + { + "completion_length": 1438.0000610351562, + "epoch": 0.8460365853658537, + "grad_norm": 0.3385039215081196, + "kl": 0.0537109375, + "learning_rate": 2.1146207790952786e-07, + "loss": 0.0021, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 5550 + }, + { + "completion_length": 2641.0, + "epoch": 0.8461890243902439, + "grad_norm": 0.10482272603576855, + "kl": 0.0634765625, + "learning_rate": 2.1105365123578746e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5551 + }, + { + "completion_length": 1475.8334350585938, + "epoch": 0.8463414634146341, + "grad_norm": 2.2148012190475797, + "kl": 0.072265625, + "learning_rate": 2.1064558951919854e-07, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5552 + }, + { + "completion_length": 885.8333435058594, + "epoch": 0.8464939024390243, + "grad_norm": 0.1434638995657928, + "kl": 0.073486328125, + "learning_rate": 2.1023789287530104e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5553 + }, + { + "completion_length": 1892.5, + "epoch": 0.8466463414634147, + "grad_norm": 0.11846583012723659, + "kl": 0.0665283203125, + "learning_rate": 2.098305614195316e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5554 + }, + { + "completion_length": 1132.8333435058594, + "epoch": 0.8467987804878049, + "grad_norm": 0.0869735093719931, + "kl": 0.0657958984375, + "learning_rate": 2.0942359526722364e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5555 + }, + { + "completion_length": 1104.5, + "epoch": 0.8469512195121951, + "grad_norm": 0.14778815752807226, + "kl": 0.0543212890625, + "learning_rate": 2.0901699453360784e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5556 + }, + { + "completion_length": 1656.166748046875, + "epoch": 0.8471036585365853, + "grad_norm": 0.06939578245251651, + "kl": 0.0592041015625, + "learning_rate": 2.086107593338099e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5557 + }, + { + "completion_length": 1839.666748046875, + "epoch": 0.8472560975609756, + "grad_norm": 0.06833582418263846, + "kl": 0.0556640625, + "learning_rate": 2.0820488978285284e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5558 + }, + { + "completion_length": 2111.8333740234375, + "epoch": 0.8474085365853659, + "grad_norm": 1.0866862722428332, + "kl": 0.051513671875, + "learning_rate": 2.077993859956564e-07, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5559 + }, + { + "completion_length": 1254.8333740234375, + "epoch": 0.8475609756097561, + "grad_norm": 2.3047704472643487, + "kl": 0.070068359375, + "learning_rate": 2.0739424808703638e-07, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5560 + }, + { + "completion_length": 1499.6666870117188, + "epoch": 0.8477134146341463, + "grad_norm": 2.212869111291082, + "kl": 0.06005859375, + "learning_rate": 2.069894761717052e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5561 + }, + { + "completion_length": 1445.8333740234375, + "epoch": 0.8478658536585366, + "grad_norm": 0.07807093765490503, + "kl": 0.0550537109375, + "learning_rate": 2.0658507036427105e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5562 + }, + { + "completion_length": 759.1666870117188, + "epoch": 0.8480182926829268, + "grad_norm": 0.19607456397949302, + "kl": 0.062744140625, + "learning_rate": 2.0618103077923912e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5563 + }, + { + "completion_length": 2074.5000610351562, + "epoch": 0.848170731707317, + "grad_norm": 0.11322470045372145, + "kl": 0.0587158203125, + "learning_rate": 2.057773575310109e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5564 + }, + { + "completion_length": 2008.5001220703125, + "epoch": 0.8483231707317073, + "grad_norm": 0.22439309477374586, + "kl": 0.075927734375, + "learning_rate": 2.0537405073388416e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5565 + }, + { + "completion_length": 2167.0, + "epoch": 0.8484756097560976, + "grad_norm": 0.08783757811614323, + "kl": 0.0565185546875, + "learning_rate": 2.0497111050205204e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5566 + }, + { + "completion_length": 1276.6667175292969, + "epoch": 0.8486280487804878, + "grad_norm": 0.19630233781660722, + "kl": 0.0478515625, + "learning_rate": 2.0456853694960477e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5567 + }, + { + "completion_length": 1838.6667175292969, + "epoch": 0.848780487804878, + "grad_norm": 0.11201267002008879, + "kl": 0.0494384765625, + "learning_rate": 2.0416633019052882e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5568 + }, + { + "completion_length": 2756.8333740234375, + "epoch": 0.8489329268292682, + "grad_norm": 0.046206371550287735, + "kl": 0.0465087890625, + "learning_rate": 2.0376449033870648e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5569 + }, + { + "completion_length": 783.3333435058594, + "epoch": 0.8490853658536586, + "grad_norm": 0.08674486030540837, + "kl": 0.0430908203125, + "learning_rate": 2.0336301750791653e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5570 + }, + { + "completion_length": 824.3333587646484, + "epoch": 0.8492378048780488, + "grad_norm": 2.3718301534327737, + "kl": 0.09033203125, + "learning_rate": 2.029619118118327e-07, + "loss": 0.0036, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5571 + }, + { + "completion_length": 2021.8333740234375, + "epoch": 0.849390243902439, + "grad_norm": 1.9343798701259074, + "kl": 0.0496826171875, + "learning_rate": 2.0256117336402586e-07, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5572 + }, + { + "completion_length": 1529.0, + "epoch": 0.8495426829268292, + "grad_norm": 0.08223632506784234, + "kl": 0.0648193359375, + "learning_rate": 2.0216080227796268e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5573 + }, + { + "completion_length": 1121.5000305175781, + "epoch": 0.8496951219512195, + "grad_norm": 0.12506370661689722, + "kl": 0.0609130859375, + "learning_rate": 2.0176079866700638e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5574 + }, + { + "completion_length": 1293.3333740234375, + "epoch": 0.8498475609756098, + "grad_norm": 0.12551214959604667, + "kl": 0.07568359375, + "learning_rate": 2.0136116264441446e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5575 + }, + { + "completion_length": 1131.3333740234375, + "epoch": 0.85, + "grad_norm": 0.11562080822475178, + "kl": 0.057373046875, + "learning_rate": 2.0096189432334195e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5576 + }, + { + "completion_length": 1626.3334350585938, + "epoch": 0.8501524390243902, + "grad_norm": 1.113919731495909, + "kl": 0.0526123046875, + "learning_rate": 2.0056299381683924e-07, + "loss": 0.0021, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5577 + }, + { + "completion_length": 2316.5000610351562, + "epoch": 0.8503048780487805, + "grad_norm": 0.05911177296433131, + "kl": 0.0499267578125, + "learning_rate": 2.0016446123785214e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5578 + }, + { + "completion_length": 2682.8333740234375, + "epoch": 0.8504573170731707, + "grad_norm": 0.05520720881834136, + "kl": 0.0452880859375, + "learning_rate": 1.9976629669922343e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5579 + }, + { + "completion_length": 1281.0000610351562, + "epoch": 0.850609756097561, + "grad_norm": 0.42354908354092247, + "kl": 0.0599365234375, + "learning_rate": 1.9936850031369003e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5580 + }, + { + "completion_length": 1577.6666870117188, + "epoch": 0.8507621951219512, + "grad_norm": 0.08773707546540804, + "kl": 0.0562744140625, + "learning_rate": 1.9897107219388588e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5581 + }, + { + "completion_length": 2581.166748046875, + "epoch": 0.8509146341463415, + "grad_norm": 0.09332539030074569, + "kl": 0.05615234375, + "learning_rate": 1.985740124523403e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5582 + }, + { + "completion_length": 1953.666748046875, + "epoch": 0.8510670731707317, + "grad_norm": 0.08941399052054853, + "kl": 0.067626953125, + "learning_rate": 1.9817732120147869e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5583 + }, + { + "completion_length": 2568.6666870117188, + "epoch": 0.8512195121951219, + "grad_norm": 0.06806834394349892, + "kl": 0.0458984375, + "learning_rate": 1.9778099855362085e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5584 + }, + { + "completion_length": 792.6666870117188, + "epoch": 0.8513719512195121, + "grad_norm": 2.337633042426228, + "kl": 0.079345703125, + "learning_rate": 1.9738504462098372e-07, + "loss": 0.0032, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5585 + }, + { + "completion_length": 1733.5000610351562, + "epoch": 0.8515243902439025, + "grad_norm": 1.726085663963514, + "kl": 0.064453125, + "learning_rate": 1.9698945951567891e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5586 + }, + { + "completion_length": 2852.166748046875, + "epoch": 0.8516768292682927, + "grad_norm": 0.05128009098894183, + "kl": 0.0390625, + "learning_rate": 1.9659424334971404e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5587 + }, + { + "completion_length": 1584.3333740234375, + "epoch": 0.8518292682926829, + "grad_norm": 0.09267232983784905, + "kl": 0.0523681640625, + "learning_rate": 1.9619939623499238e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5588 + }, + { + "completion_length": 650.0000305175781, + "epoch": 0.8519817073170731, + "grad_norm": 0.5884568178842503, + "kl": 0.0618896484375, + "learning_rate": 1.958049182833117e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5589 + }, + { + "completion_length": 1588.8333435058594, + "epoch": 0.8521341463414634, + "grad_norm": 0.11428704624960846, + "kl": 0.0545654296875, + "learning_rate": 1.9541080960636664e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5590 + }, + { + "completion_length": 1329.0000305175781, + "epoch": 0.8522865853658537, + "grad_norm": 0.0773454927576476, + "kl": 0.0526123046875, + "learning_rate": 1.950170703157461e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5591 + }, + { + "completion_length": 1575.3333740234375, + "epoch": 0.8524390243902439, + "grad_norm": 0.08344653687642074, + "kl": 0.06005859375, + "learning_rate": 1.9462370052293544e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5592 + }, + { + "completion_length": 2688.8333740234375, + "epoch": 0.8525914634146341, + "grad_norm": 0.07473714912547161, + "kl": 0.053466796875, + "learning_rate": 1.942307003393145e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5593 + }, + { + "completion_length": 1697.6666870117188, + "epoch": 0.8527439024390244, + "grad_norm": 0.06027135849561963, + "kl": 0.0501708984375, + "learning_rate": 1.9383806987615897e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5594 + }, + { + "completion_length": 2289.666717529297, + "epoch": 0.8528963414634146, + "grad_norm": 0.07909885170816439, + "kl": 0.0445556640625, + "learning_rate": 1.9344580924463994e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5595 + }, + { + "completion_length": 1806.166748046875, + "epoch": 0.8530487804878049, + "grad_norm": 0.1055441134792981, + "kl": 0.060546875, + "learning_rate": 1.9305391855582355e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5596 + }, + { + "completion_length": 777.1666870117188, + "epoch": 0.8532012195121951, + "grad_norm": 0.19728635906837702, + "kl": 0.0823974609375, + "learning_rate": 1.926623979206707e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5597 + }, + { + "completion_length": 2876.3333740234375, + "epoch": 0.8533536585365854, + "grad_norm": 0.9482483156377308, + "kl": 0.049560546875, + "learning_rate": 1.9227124745003822e-07, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5598 + }, + { + "completion_length": 968.8333740234375, + "epoch": 0.8535060975609756, + "grad_norm": 0.13321228291729337, + "kl": 0.0660400390625, + "learning_rate": 1.9188046725467816e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5599 + }, + { + "completion_length": 893.8333435058594, + "epoch": 0.8536585365853658, + "grad_norm": 0.10963080395021038, + "kl": 0.039306640625, + "learning_rate": 1.9149005744523757e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5600 + }, + { + "completion_length": 1492.5, + "epoch": 0.853810975609756, + "grad_norm": 0.07510661671051287, + "kl": 0.0626220703125, + "learning_rate": 1.9110001813225863e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5601 + }, + { + "completion_length": 2266.6666870117188, + "epoch": 0.8539634146341464, + "grad_norm": 0.10074141765679477, + "kl": 0.0526123046875, + "learning_rate": 1.9071034942617786e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5602 + }, + { + "completion_length": 1910.3333740234375, + "epoch": 0.8541158536585366, + "grad_norm": 2.200312216154909, + "kl": 0.065673828125, + "learning_rate": 1.903210514373282e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5603 + }, + { + "completion_length": 2298.8333740234375, + "epoch": 0.8542682926829268, + "grad_norm": 0.08026933282724852, + "kl": 0.0550537109375, + "learning_rate": 1.8993212427593658e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5604 + }, + { + "completion_length": 1909.5, + "epoch": 0.854420731707317, + "grad_norm": 0.08030892991487923, + "kl": 0.0526123046875, + "learning_rate": 1.8954356805212563e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5605 + }, + { + "completion_length": 1631.5, + "epoch": 0.8545731707317074, + "grad_norm": 0.08850214219842704, + "kl": 0.05126953125, + "learning_rate": 1.8915538287591284e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5606 + }, + { + "completion_length": 1028.1666870117188, + "epoch": 0.8547256097560976, + "grad_norm": 2.0168175778061954, + "kl": 0.10009765625, + "learning_rate": 1.8876756885720958e-07, + "loss": 0.004, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5607 + }, + { + "completion_length": 2588.33349609375, + "epoch": 0.8548780487804878, + "grad_norm": 0.09141158076197244, + "kl": 0.0533447265625, + "learning_rate": 1.8838012610582356e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5608 + }, + { + "completion_length": 1949.8334350585938, + "epoch": 0.855030487804878, + "grad_norm": 0.1283985062645339, + "kl": 0.06103515625, + "learning_rate": 1.8799305473145667e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5609 + }, + { + "completion_length": 3016.1666870117188, + "epoch": 0.8551829268292683, + "grad_norm": 0.06524091271309224, + "kl": 0.0557861328125, + "learning_rate": 1.8760635484370591e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5610 + }, + { + "completion_length": 2349.3333740234375, + "epoch": 0.8553353658536585, + "grad_norm": 0.04302095405779245, + "kl": 0.03704833984375, + "learning_rate": 1.8722002655206272e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5611 + }, + { + "completion_length": 1736.3334350585938, + "epoch": 0.8554878048780488, + "grad_norm": 1.6815093872402973, + "kl": 0.049560546875, + "learning_rate": 1.8683406996591373e-07, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5612 + }, + { + "completion_length": 1040.0000305175781, + "epoch": 0.855640243902439, + "grad_norm": 0.20954236209414048, + "kl": 0.0543212890625, + "learning_rate": 1.8644848519454022e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5613 + }, + { + "completion_length": 1924.166748046875, + "epoch": 0.8557926829268293, + "grad_norm": 0.06972555770701855, + "kl": 0.0579833984375, + "learning_rate": 1.8606327234711818e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5614 + }, + { + "completion_length": 860.1667175292969, + "epoch": 0.8559451219512195, + "grad_norm": 0.0941690741867977, + "kl": 0.046875, + "learning_rate": 1.856784315327177e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5615 + }, + { + "completion_length": 2065.666748046875, + "epoch": 0.8560975609756097, + "grad_norm": 0.06767296025481413, + "kl": 0.0458984375, + "learning_rate": 1.852939628603046e-07, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5616 + }, + { + "completion_length": 1863.0, + "epoch": 0.85625, + "grad_norm": 2.121789830602077, + "kl": 0.0535888671875, + "learning_rate": 1.8490986643873847e-07, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5617 + }, + { + "completion_length": 2167.83349609375, + "epoch": 0.8564024390243903, + "grad_norm": 0.10096615965672831, + "kl": 0.067626953125, + "learning_rate": 1.84526142376774e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5618 + }, + { + "completion_length": 848.5, + "epoch": 0.8565548780487805, + "grad_norm": 0.2179213578924609, + "kl": 0.0843505859375, + "learning_rate": 1.8414279078306046e-07, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5619 + }, + { + "completion_length": 1180.3333740234375, + "epoch": 0.8567073170731707, + "grad_norm": 0.09234591536582074, + "kl": 0.0455322265625, + "learning_rate": 1.8375981176614114e-07, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5620 + }, + { + "completion_length": 1512.166748046875, + "epoch": 0.8568597560975609, + "grad_norm": 2.265073416327829, + "kl": 0.0875244140625, + "learning_rate": 1.8337720543445417e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5621 + }, + { + "completion_length": 1437.1666870117188, + "epoch": 0.8570121951219513, + "grad_norm": 0.10467967193609334, + "kl": 0.0584716796875, + "learning_rate": 1.8299497189633223e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5622 + }, + { + "completion_length": 1084.8333740234375, + "epoch": 0.8571646341463415, + "grad_norm": 0.12422424901590529, + "kl": 0.06298828125, + "learning_rate": 1.8261311126000256e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5623 + }, + { + "completion_length": 810.8333740234375, + "epoch": 0.8573170731707317, + "grad_norm": 0.17987492007946712, + "kl": 0.0606689453125, + "learning_rate": 1.822316236335867e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5624 + }, + { + "completion_length": 2261.6666717529297, + "epoch": 0.8574695121951219, + "grad_norm": 2.029478063367512, + "kl": 0.0574951171875, + "learning_rate": 1.8185050912510004e-07, + "loss": 0.0023, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5625 + }, + { + "completion_length": 1094.0000610351562, + "epoch": 0.8576219512195122, + "grad_norm": 0.08554245431945595, + "kl": 0.05224609375, + "learning_rate": 1.8146976784245296e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5626 + }, + { + "completion_length": 1884.8333740234375, + "epoch": 0.8577743902439025, + "grad_norm": 0.07518628789121769, + "kl": 0.056640625, + "learning_rate": 1.8108939989345001e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5627 + }, + { + "completion_length": 1437.666748046875, + "epoch": 0.8579268292682927, + "grad_norm": 1.1548855782320153, + "kl": 0.047607421875, + "learning_rate": 1.8070940538579044e-07, + "loss": 0.0019, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5628 + }, + { + "completion_length": 1328.3333740234375, + "epoch": 0.8580792682926829, + "grad_norm": 0.09460054261496313, + "kl": 0.0609130859375, + "learning_rate": 1.8032978442706632e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5629 + }, + { + "completion_length": 811.6666870117188, + "epoch": 0.8582317073170732, + "grad_norm": 0.11683697192213725, + "kl": 0.0382080078125, + "learning_rate": 1.79950537124766e-07, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5630 + }, + { + "completion_length": 1373.666748046875, + "epoch": 0.8583841463414634, + "grad_norm": 0.0905196298953356, + "kl": 0.045654296875, + "learning_rate": 1.7957166358627042e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5631 + }, + { + "completion_length": 2842.666748046875, + "epoch": 0.8585365853658536, + "grad_norm": 0.06044600274992132, + "kl": 0.0511474609375, + "learning_rate": 1.7919316391885593e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5632 + }, + { + "completion_length": 2384.666748046875, + "epoch": 0.8586890243902439, + "grad_norm": 0.05481304785862157, + "kl": 0.0494384765625, + "learning_rate": 1.7881503822969147e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5633 + }, + { + "completion_length": 1191.8333740234375, + "epoch": 0.8588414634146342, + "grad_norm": 1.4424742743386796, + "kl": 0.048828125, + "learning_rate": 1.7843728662584141e-07, + "loss": 0.0019, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5634 + }, + { + "completion_length": 1725.0, + "epoch": 0.8589939024390244, + "grad_norm": 0.10329049363762947, + "kl": 0.0550537109375, + "learning_rate": 1.7805990921426385e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5635 + }, + { + "completion_length": 2928.8333740234375, + "epoch": 0.8591463414634146, + "grad_norm": 0.07143452385579796, + "kl": 0.0438232421875, + "learning_rate": 1.7768290610181065e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5636 + }, + { + "completion_length": 1128.1666870117188, + "epoch": 0.8592987804878048, + "grad_norm": 0.15242800164662235, + "kl": 0.103759765625, + "learning_rate": 1.7730627739522826e-07, + "loss": 0.0041, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5637 + }, + { + "completion_length": 876.8333740234375, + "epoch": 0.8594512195121952, + "grad_norm": 0.11291608594242027, + "kl": 0.054443359375, + "learning_rate": 1.7693002320115636e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5638 + }, + { + "completion_length": 1389.5, + "epoch": 0.8596036585365854, + "grad_norm": 0.2260569130983396, + "kl": 0.053466796875, + "learning_rate": 1.7655414362612892e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5639 + }, + { + "completion_length": 1899.666748046875, + "epoch": 0.8597560975609756, + "grad_norm": 1.0659764966901564, + "kl": 0.0494384765625, + "learning_rate": 1.761786387765743e-07, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5640 + }, + { + "completion_length": 2047.166748046875, + "epoch": 0.8599085365853658, + "grad_norm": 0.1462948853043617, + "kl": 0.06494140625, + "learning_rate": 1.7580350875881411e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5641 + }, + { + "completion_length": 1390.666748046875, + "epoch": 0.8600609756097561, + "grad_norm": 0.07196697158235127, + "kl": 0.0540771484375, + "learning_rate": 1.7542875367906441e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5642 + }, + { + "completion_length": 2492.8333740234375, + "epoch": 0.8602134146341464, + "grad_norm": 0.06120627572191805, + "kl": 0.041748046875, + "learning_rate": 1.750543736434343e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5643 + }, + { + "completion_length": 2372.5000610351562, + "epoch": 0.8603658536585366, + "grad_norm": 0.05636659267659327, + "kl": 0.0447998046875, + "learning_rate": 1.7468036875792747e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5644 + }, + { + "completion_length": 1291.0000610351562, + "epoch": 0.8605182926829268, + "grad_norm": 0.14361195086122225, + "kl": 0.055419921875, + "learning_rate": 1.7430673912844075e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5645 + }, + { + "completion_length": 792.0000305175781, + "epoch": 0.8606707317073171, + "grad_norm": 0.4703625085444251, + "kl": 0.0906982421875, + "learning_rate": 1.7393348486076582e-07, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5646 + }, + { + "completion_length": 1099.3333740234375, + "epoch": 0.8608231707317073, + "grad_norm": 0.15854674223255072, + "kl": 0.077880859375, + "learning_rate": 1.73560606060586e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5647 + }, + { + "completion_length": 1331.8333740234375, + "epoch": 0.8609756097560975, + "grad_norm": 1.5822649212772295, + "kl": 0.05078125, + "learning_rate": 1.731881028334808e-07, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5648 + }, + { + "completion_length": 1293.3333740234375, + "epoch": 0.8611280487804878, + "grad_norm": 0.09315936813047218, + "kl": 0.051025390625, + "learning_rate": 1.728159752849217e-07, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5649 + }, + { + "completion_length": 1183.0000610351562, + "epoch": 0.8612804878048781, + "grad_norm": 0.09623457796872036, + "kl": 0.0511474609375, + "learning_rate": 1.724442235202748e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5650 + }, + { + "completion_length": 2074.1666870117188, + "epoch": 0.8614329268292683, + "grad_norm": 0.06528553100923758, + "kl": 0.043212890625, + "learning_rate": 1.7207284764479847e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5651 + }, + { + "completion_length": 2268.166748046875, + "epoch": 0.8615853658536585, + "grad_norm": 0.07069057550110858, + "kl": 0.049072265625, + "learning_rate": 1.71701847763646e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5652 + }, + { + "completion_length": 1315.6666717529297, + "epoch": 0.8617378048780487, + "grad_norm": 0.15628910491160142, + "kl": 0.095703125, + "learning_rate": 1.7133122398186352e-07, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5653 + }, + { + "completion_length": 2836.8333740234375, + "epoch": 0.8618902439024391, + "grad_norm": 0.049542796651562505, + "kl": 0.04248046875, + "learning_rate": 1.709609764043909e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5654 + }, + { + "completion_length": 1451.0000305175781, + "epoch": 0.8620426829268293, + "grad_norm": 0.07195353675769597, + "kl": 0.0523681640625, + "learning_rate": 1.705911051360618e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5655 + }, + { + "completion_length": 1457.3334350585938, + "epoch": 0.8621951219512195, + "grad_norm": 0.07619689879697744, + "kl": 0.0535888671875, + "learning_rate": 1.7022161028160244e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5656 + }, + { + "completion_length": 2097.166717529297, + "epoch": 0.8623475609756097, + "grad_norm": 1.3888115197641981, + "kl": 0.0460205078125, + "learning_rate": 1.6985249194563313e-07, + "loss": 0.0018, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5657 + }, + { + "completion_length": 2461.83349609375, + "epoch": 0.8625, + "grad_norm": 0.07160928997490484, + "kl": 0.05126953125, + "learning_rate": 1.6948375023266743e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5658 + }, + { + "completion_length": 1657.8333435058594, + "epoch": 0.8626524390243903, + "grad_norm": 2.097984905453883, + "kl": 0.0654296875, + "learning_rate": 1.691153852471127e-07, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5659 + }, + { + "completion_length": 1412.6666870117188, + "epoch": 0.8628048780487805, + "grad_norm": 0.09062855908705231, + "kl": 0.064453125, + "learning_rate": 1.6874739709326858e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5660 + }, + { + "completion_length": 1189.1667175292969, + "epoch": 0.8629573170731707, + "grad_norm": 0.11559264458668823, + "kl": 0.0653076171875, + "learning_rate": 1.6837978587532881e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5661 + }, + { + "completion_length": 725.6666870117188, + "epoch": 0.863109756097561, + "grad_norm": 1.9687020861510802, + "kl": 0.0732421875, + "learning_rate": 1.6801255169738027e-07, + "loss": 0.0029, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5662 + }, + { + "completion_length": 980.0000305175781, + "epoch": 0.8632621951219512, + "grad_norm": 0.09257348488985565, + "kl": 0.0477294921875, + "learning_rate": 1.6764569466340313e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5663 + }, + { + "completion_length": 820.6666870117188, + "epoch": 0.8634146341463415, + "grad_norm": 0.1636730464006787, + "kl": 0.079345703125, + "learning_rate": 1.6727921487727095e-07, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5664 + }, + { + "completion_length": 894.8333435058594, + "epoch": 0.8635670731707317, + "grad_norm": 0.1285880700188196, + "kl": 0.070556640625, + "learning_rate": 1.6691311244274938e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5665 + }, + { + "completion_length": 2316.8334350585938, + "epoch": 0.863719512195122, + "grad_norm": 0.061767116197809106, + "kl": 0.0538330078125, + "learning_rate": 1.6654738746349835e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5666 + }, + { + "completion_length": 1324.6666870117188, + "epoch": 0.8638719512195122, + "grad_norm": 0.08157879574761293, + "kl": 0.049072265625, + "learning_rate": 1.661820400430712e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5667 + }, + { + "completion_length": 1991.1666870117188, + "epoch": 0.8640243902439024, + "grad_norm": 0.11552661136270674, + "kl": 0.06298828125, + "learning_rate": 1.658170702849135e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5668 + }, + { + "completion_length": 1496.8333435058594, + "epoch": 0.8641768292682926, + "grad_norm": 1.1823614671743887, + "kl": 0.05029296875, + "learning_rate": 1.6545247829236394e-07, + "loss": 0.002, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5669 + }, + { + "completion_length": 2368.666748046875, + "epoch": 0.864329268292683, + "grad_norm": 0.05957425750556642, + "kl": 0.052978515625, + "learning_rate": 1.650882641686544e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5670 + }, + { + "completion_length": 1730.8333740234375, + "epoch": 0.8644817073170732, + "grad_norm": 0.12809355906149852, + "kl": 0.0614013671875, + "learning_rate": 1.6472442801691012e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5671 + }, + { + "completion_length": 1167.5, + "epoch": 0.8646341463414634, + "grad_norm": 0.10102842387550871, + "kl": 0.0552978515625, + "learning_rate": 1.64360969940149e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5672 + }, + { + "completion_length": 906.6666870117188, + "epoch": 0.8647865853658536, + "grad_norm": 0.08407068131222654, + "kl": 0.03643798828125, + "learning_rate": 1.6399789004128217e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5673 + }, + { + "completion_length": 3111.666748046875, + "epoch": 0.864939024390244, + "grad_norm": 0.05260369483881798, + "kl": 0.0572509765625, + "learning_rate": 1.6363518842311298e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5674 + }, + { + "completion_length": 1150.1666870117188, + "epoch": 0.8650914634146342, + "grad_norm": 0.10424925961034763, + "kl": 0.060302734375, + "learning_rate": 1.6327286518833822e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5675 + }, + { + "completion_length": 1888.6666870117188, + "epoch": 0.8652439024390244, + "grad_norm": 0.12974627705750827, + "kl": 0.053955078125, + "learning_rate": 1.6291092043954752e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5676 + }, + { + "completion_length": 1514.3333740234375, + "epoch": 0.8653963414634146, + "grad_norm": 1.1987113791351947, + "kl": 0.050048828125, + "learning_rate": 1.6254935427922374e-07, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5677 + }, + { + "completion_length": 1776.666748046875, + "epoch": 0.8655487804878049, + "grad_norm": 1.9918265836470408, + "kl": 0.060546875, + "learning_rate": 1.6218816680974146e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5678 + }, + { + "completion_length": 1513.166748046875, + "epoch": 0.8657012195121951, + "grad_norm": 1.817324916914736, + "kl": 0.0576171875, + "learning_rate": 1.618273581333689e-07, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5679 + }, + { + "completion_length": 1163.1666870117188, + "epoch": 0.8658536585365854, + "grad_norm": 1.1277159088724424, + "kl": 0.0675048828125, + "learning_rate": 1.6146692835226669e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5680 + }, + { + "completion_length": 838.0000305175781, + "epoch": 0.8660060975609756, + "grad_norm": 0.08885556035822068, + "kl": 0.0631103515625, + "learning_rate": 1.6110687756848846e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5681 + }, + { + "completion_length": 714.0, + "epoch": 0.8661585365853659, + "grad_norm": 1.6563744380729537, + "kl": 0.076904296875, + "learning_rate": 1.6074720588398056e-07, + "loss": 0.0031, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5682 + }, + { + "completion_length": 986.3333435058594, + "epoch": 0.8663109756097561, + "grad_norm": 0.11349696284105437, + "kl": 0.0482177734375, + "learning_rate": 1.6038791340058133e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5683 + }, + { + "completion_length": 889.6667175292969, + "epoch": 0.8664634146341463, + "grad_norm": 0.07061730916913515, + "kl": 0.02691650390625, + "learning_rate": 1.6002900022002193e-07, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5684 + }, + { + "completion_length": 2782.666748046875, + "epoch": 0.8666158536585366, + "grad_norm": 0.05473381514666025, + "kl": 0.0462646484375, + "learning_rate": 1.5967046644392734e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5685 + }, + { + "completion_length": 1404.5000610351562, + "epoch": 0.8667682926829269, + "grad_norm": 0.11740741474174161, + "kl": 0.0592041015625, + "learning_rate": 1.5931231217381392e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5686 + }, + { + "completion_length": 3133.5, + "epoch": 0.8669207317073171, + "grad_norm": 0.06309941859577517, + "kl": 0.0556640625, + "learning_rate": 1.5895453751109018e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5687 + }, + { + "completion_length": 2809.666748046875, + "epoch": 0.8670731707317073, + "grad_norm": 0.06909604665294286, + "kl": 0.052734375, + "learning_rate": 1.5859714255705843e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5688 + }, + { + "completion_length": 1710.6666870117188, + "epoch": 0.8672256097560975, + "grad_norm": 0.09716835324405085, + "kl": 0.06396484375, + "learning_rate": 1.5824012741291255e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5689 + }, + { + "completion_length": 1511.0000610351562, + "epoch": 0.8673780487804879, + "grad_norm": 0.0912739194208347, + "kl": 0.0521240234375, + "learning_rate": 1.5788349217973951e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5690 + }, + { + "completion_length": 1166.1666870117188, + "epoch": 0.8675304878048781, + "grad_norm": 0.18952489382110127, + "kl": 0.0670166015625, + "learning_rate": 1.5752723695851772e-07, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5691 + }, + { + "completion_length": 2029.3334350585938, + "epoch": 0.8676829268292683, + "grad_norm": 0.05940919113786309, + "kl": 0.0498046875, + "learning_rate": 1.57171361850119e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5692 + }, + { + "completion_length": 2796.666748046875, + "epoch": 0.8678353658536585, + "grad_norm": 0.05357582006472892, + "kl": 0.0472412109375, + "learning_rate": 1.568158669553072e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5693 + }, + { + "completion_length": 2584.666748046875, + "epoch": 0.8679878048780488, + "grad_norm": 2.1651331468419124, + "kl": 0.0537109375, + "learning_rate": 1.564607523747385e-07, + "loss": 0.0022, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5694 + }, + { + "completion_length": 2043.5, + "epoch": 0.868140243902439, + "grad_norm": 0.08906836970619265, + "kl": 0.060302734375, + "learning_rate": 1.5610601820896165e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5695 + }, + { + "completion_length": 2146.1666870117188, + "epoch": 0.8682926829268293, + "grad_norm": 0.09734597526145618, + "kl": 0.0479736328125, + "learning_rate": 1.5575166455841677e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5696 + }, + { + "completion_length": 1333.6666870117188, + "epoch": 0.8684451219512195, + "grad_norm": 0.09109093590257582, + "kl": 0.0477294921875, + "learning_rate": 1.5539769152343731e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5697 + }, + { + "completion_length": 3185.666748046875, + "epoch": 0.8685975609756098, + "grad_norm": 0.050605664455192094, + "kl": 0.0570068359375, + "learning_rate": 1.550440992042484e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5698 + }, + { + "completion_length": 1475.0000305175781, + "epoch": 0.86875, + "grad_norm": 0.09779049956996859, + "kl": 0.0504150390625, + "learning_rate": 1.5469088770096763e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5699 + }, + { + "completion_length": 867.3333435058594, + "epoch": 0.8689024390243902, + "grad_norm": 0.09096709440767958, + "kl": 0.0469970703125, + "learning_rate": 1.5433805711360484e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5700 + }, + { + "completion_length": 2746.166748046875, + "epoch": 0.8690548780487805, + "grad_norm": 0.10931008534079123, + "kl": 0.0574951171875, + "learning_rate": 1.5398560754206127e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5701 + }, + { + "completion_length": 1221.3333740234375, + "epoch": 0.8692073170731708, + "grad_norm": 1.872568391036442, + "kl": 0.053466796875, + "learning_rate": 1.5363353908613108e-07, + "loss": 0.0021, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 5702 + }, + { + "completion_length": 975.1666870117188, + "epoch": 0.869359756097561, + "grad_norm": 0.15084319874549595, + "kl": 0.0565185546875, + "learning_rate": 1.5328185184550027e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5703 + }, + { + "completion_length": 1568.8333740234375, + "epoch": 0.8695121951219512, + "grad_norm": 0.1482759311064583, + "kl": 0.066650390625, + "learning_rate": 1.5293054591974726e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5704 + }, + { + "completion_length": 1337.0000610351562, + "epoch": 0.8696646341463414, + "grad_norm": 0.10384429180236442, + "kl": 0.0543212890625, + "learning_rate": 1.525796214083417e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5705 + }, + { + "completion_length": 2806.0001220703125, + "epoch": 0.8698170731707318, + "grad_norm": 0.3239362930808143, + "kl": 0.048828125, + "learning_rate": 1.5222907841064575e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5706 + }, + { + "completion_length": 2457.166748046875, + "epoch": 0.869969512195122, + "grad_norm": 0.05461794282020162, + "kl": 0.05419921875, + "learning_rate": 1.5187891702591338e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5707 + }, + { + "completion_length": 1132.5, + "epoch": 0.8701219512195122, + "grad_norm": 0.1079333878736045, + "kl": 0.0474853515625, + "learning_rate": 1.5152913735329128e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5708 + }, + { + "completion_length": 2192.166748046875, + "epoch": 0.8702743902439024, + "grad_norm": 0.0692428264990252, + "kl": 0.041748046875, + "learning_rate": 1.511797394918165e-07, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5709 + }, + { + "completion_length": 1473.3333740234375, + "epoch": 0.8704268292682927, + "grad_norm": 0.077518018655668, + "kl": 0.0584716796875, + "learning_rate": 1.508307235404193e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5710 + }, + { + "completion_length": 1862.5000610351562, + "epoch": 0.870579268292683, + "grad_norm": 1.2490305452589794, + "kl": 0.0604248046875, + "learning_rate": 1.5048208959792131e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5711 + }, + { + "completion_length": 2183.5000610351562, + "epoch": 0.8707317073170732, + "grad_norm": 0.07318212780730443, + "kl": 0.051025390625, + "learning_rate": 1.501338377630362e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5712 + }, + { + "completion_length": 1436.0000610351562, + "epoch": 0.8708841463414634, + "grad_norm": 0.06107373314934676, + "kl": 0.048095703125, + "learning_rate": 1.497859681343694e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5713 + }, + { + "completion_length": 1312.0000610351562, + "epoch": 0.8710365853658537, + "grad_norm": 0.0776947908048117, + "kl": 0.0556640625, + "learning_rate": 1.494384808104175e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5714 + }, + { + "completion_length": 1747.3333740234375, + "epoch": 0.8711890243902439, + "grad_norm": 0.060823690340133366, + "kl": 0.0455322265625, + "learning_rate": 1.4909137588956978e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5715 + }, + { + "completion_length": 1463.1667175292969, + "epoch": 0.8713414634146341, + "grad_norm": 1.419866009646584, + "kl": 0.062255859375, + "learning_rate": 1.4874465347010663e-07, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5716 + }, + { + "completion_length": 2484.3334350585938, + "epoch": 0.8714939024390244, + "grad_norm": 0.17933188599694455, + "kl": 0.0540771484375, + "learning_rate": 1.4839831365020068e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5717 + }, + { + "completion_length": 1574.6666870117188, + "epoch": 0.8716463414634147, + "grad_norm": 0.06892483904810999, + "kl": 0.0439453125, + "learning_rate": 1.4805235652791599e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5718 + }, + { + "completion_length": 2026.0000610351562, + "epoch": 0.8717987804878049, + "grad_norm": 0.08274123964379836, + "kl": 0.057373046875, + "learning_rate": 1.4770678220120753e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5719 + }, + { + "completion_length": 2667.5001220703125, + "epoch": 0.8719512195121951, + "grad_norm": 0.8523662708683187, + "kl": 0.04541015625, + "learning_rate": 1.473615907679229e-07, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5720 + }, + { + "completion_length": 826.3333740234375, + "epoch": 0.8721036585365853, + "grad_norm": 1.8778875383207536, + "kl": 0.06591796875, + "learning_rate": 1.47016782325801e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5721 + }, + { + "completion_length": 2488.5, + "epoch": 0.8722560975609757, + "grad_norm": 0.07309560436565753, + "kl": 0.052978515625, + "learning_rate": 1.46672356972472e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5722 + }, + { + "completion_length": 3794.166748046875, + "epoch": 0.8724085365853659, + "grad_norm": 0.03524410654510019, + "kl": 0.0386962890625, + "learning_rate": 1.4632831480545783e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5723 + }, + { + "completion_length": 1200.666748046875, + "epoch": 0.8725609756097561, + "grad_norm": 0.07594506363795599, + "kl": 0.0367431640625, + "learning_rate": 1.459846559221721e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5724 + }, + { + "completion_length": 2219.8334350585938, + "epoch": 0.8727134146341463, + "grad_norm": 0.06078234263095092, + "kl": 0.0479736328125, + "learning_rate": 1.4564138041991942e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5725 + }, + { + "completion_length": 1324.0000610351562, + "epoch": 0.8728658536585366, + "grad_norm": 0.0674848564359084, + "kl": 0.05419921875, + "learning_rate": 1.452984883958966e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5726 + }, + { + "completion_length": 926.1666870117188, + "epoch": 0.8730182926829269, + "grad_norm": 0.10800443589620057, + "kl": 0.0546875, + "learning_rate": 1.4495597994719078e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5727 + }, + { + "completion_length": 1286.8333740234375, + "epoch": 0.8731707317073171, + "grad_norm": 0.07851186935489934, + "kl": 0.0589599609375, + "learning_rate": 1.446138551707814e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5728 + }, + { + "completion_length": 1437.166748046875, + "epoch": 0.8733231707317073, + "grad_norm": 0.06082191507463826, + "kl": 0.0423583984375, + "learning_rate": 1.442721141635388e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5729 + }, + { + "completion_length": 2931.3333740234375, + "epoch": 0.8734756097560976, + "grad_norm": 0.046462795940328004, + "kl": 0.052734375, + "learning_rate": 1.439307570222249e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5730 + }, + { + "completion_length": 1574.166748046875, + "epoch": 0.8736280487804878, + "grad_norm": 0.07476521150873576, + "kl": 0.049560546875, + "learning_rate": 1.4358978384349326e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5731 + }, + { + "completion_length": 1999.166748046875, + "epoch": 0.873780487804878, + "grad_norm": 0.08310881576268829, + "kl": 0.04974365234375, + "learning_rate": 1.432491947238876e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5732 + }, + { + "completion_length": 1273.5000610351562, + "epoch": 0.8739329268292683, + "grad_norm": 0.12419328248546334, + "kl": 0.0621337890625, + "learning_rate": 1.4290898975984397e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5733 + }, + { + "completion_length": 669.5000305175781, + "epoch": 0.8740853658536586, + "grad_norm": 0.1383882376573919, + "kl": 0.08154296875, + "learning_rate": 1.425691690476893e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5734 + }, + { + "completion_length": 2425.0, + "epoch": 0.8742378048780488, + "grad_norm": 0.07505805540716594, + "kl": 0.0439453125, + "learning_rate": 1.422297326836418e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5735 + }, + { + "completion_length": 1239.6666870117188, + "epoch": 0.874390243902439, + "grad_norm": 1.5044479155068264, + "kl": 0.056640625, + "learning_rate": 1.4189068076381078e-07, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5736 + }, + { + "completion_length": 1159.3333740234375, + "epoch": 0.8745426829268292, + "grad_norm": 1.7450671292240347, + "kl": 0.051513671875, + "learning_rate": 1.4155201338419643e-07, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5737 + }, + { + "completion_length": 2073.0000610351562, + "epoch": 0.8746951219512196, + "grad_norm": 0.13342634003371315, + "kl": 0.048095703125, + "learning_rate": 1.4121373064069038e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5738 + }, + { + "completion_length": 1342.0000915527344, + "epoch": 0.8748475609756098, + "grad_norm": 0.20076034396276446, + "kl": 0.0447998046875, + "learning_rate": 1.4087583262907533e-07, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5739 + }, + { + "completion_length": 1519.166748046875, + "epoch": 0.875, + "grad_norm": 0.07186547877606672, + "kl": 0.059326171875, + "learning_rate": 1.405383194450251e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5740 + }, + { + "completion_length": 1124.0, + "epoch": 0.8751524390243902, + "grad_norm": 0.07859437807265902, + "kl": 0.0552978515625, + "learning_rate": 1.4020119118410441e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5741 + }, + { + "completion_length": 1540.5, + "epoch": 0.8753048780487804, + "grad_norm": 2.371511061878838, + "kl": 0.091064453125, + "learning_rate": 1.398644479417689e-07, + "loss": 0.0036, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 5742 + }, + { + "completion_length": 1407.1666870117188, + "epoch": 0.8754573170731708, + "grad_norm": 0.06076969192394484, + "kl": 0.0341796875, + "learning_rate": 1.3952808981336546e-07, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5743 + }, + { + "completion_length": 1522.6667175292969, + "epoch": 0.875609756097561, + "grad_norm": 0.10415362635426641, + "kl": 0.046875, + "learning_rate": 1.3919211689413207e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5744 + }, + { + "completion_length": 2012.1666870117188, + "epoch": 0.8757621951219512, + "grad_norm": 1.69827812772698, + "kl": 0.065673828125, + "learning_rate": 1.3885652927919685e-07, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5745 + }, + { + "completion_length": 2162.5000610351562, + "epoch": 0.8759146341463414, + "grad_norm": 0.092469766949641, + "kl": 0.0460205078125, + "learning_rate": 1.3852132706357956e-07, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5746 + }, + { + "completion_length": 2174.3333740234375, + "epoch": 0.8760670731707317, + "grad_norm": 0.07028585484001634, + "kl": 0.0694580078125, + "learning_rate": 1.3818651034219066e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5747 + }, + { + "completion_length": 1713.5000610351562, + "epoch": 0.876219512195122, + "grad_norm": 0.08309316921310428, + "kl": 0.0623779296875, + "learning_rate": 1.3785207920983145e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5748 + }, + { + "completion_length": 1370.5000610351562, + "epoch": 0.8763719512195122, + "grad_norm": 0.07749159739276831, + "kl": 0.0509033203125, + "learning_rate": 1.375180337611942e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5749 + }, + { + "completion_length": 1802.8333740234375, + "epoch": 0.8765243902439024, + "grad_norm": 0.08859512435879698, + "kl": 0.0577392578125, + "learning_rate": 1.3718437409086144e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5750 + }, + { + "completion_length": 1781.0000915527344, + "epoch": 0.8766768292682927, + "grad_norm": 0.10863405509524927, + "kl": 0.0538330078125, + "learning_rate": 1.368511002933069e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5751 + }, + { + "completion_length": 1814.6667175292969, + "epoch": 0.8768292682926829, + "grad_norm": 0.07619549109372081, + "kl": 0.0582275390625, + "learning_rate": 1.365182124628949e-07, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5752 + }, + { + "completion_length": 1637.1666870117188, + "epoch": 0.8769817073170731, + "grad_norm": 0.10797231926774242, + "kl": 0.0595703125, + "learning_rate": 1.361857106938813e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5753 + }, + { + "completion_length": 3113.3333740234375, + "epoch": 0.8771341463414634, + "grad_norm": 0.8864879035994693, + "kl": 0.048095703125, + "learning_rate": 1.358535950804109e-07, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5754 + }, + { + "completion_length": 988.3333435058594, + "epoch": 0.8772865853658537, + "grad_norm": 0.10387649152011394, + "kl": 0.0640869140625, + "learning_rate": 1.3552186571652066e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5755 + }, + { + "completion_length": 1307.8333435058594, + "epoch": 0.8774390243902439, + "grad_norm": 0.09759880525481067, + "kl": 0.072998046875, + "learning_rate": 1.3519052269613757e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5756 + }, + { + "completion_length": 1970.8334350585938, + "epoch": 0.8775914634146341, + "grad_norm": 0.07468802977774375, + "kl": 0.0592041015625, + "learning_rate": 1.3485956611307947e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5757 + }, + { + "completion_length": 2120.8333740234375, + "epoch": 0.8777439024390243, + "grad_norm": 0.0571854600964868, + "kl": 0.056884765625, + "learning_rate": 1.3452899606105473e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5758 + }, + { + "completion_length": 2474.0, + "epoch": 0.8778963414634147, + "grad_norm": 0.08160308610662248, + "kl": 0.0548095703125, + "learning_rate": 1.3419881263366196e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5759 + }, + { + "completion_length": 969.5, + "epoch": 0.8780487804878049, + "grad_norm": 0.18183750770687704, + "kl": 0.0467529296875, + "learning_rate": 1.3386901592439071e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5760 + }, + { + "completion_length": 1489.666748046875, + "epoch": 0.8782012195121951, + "grad_norm": 0.13005601463453445, + "kl": 0.05322265625, + "learning_rate": 1.3353960602662097e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5761 + }, + { + "completion_length": 1168.6666870117188, + "epoch": 0.8783536585365853, + "grad_norm": 0.07302413037554284, + "kl": 0.0465087890625, + "learning_rate": 1.332105830336231e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5762 + }, + { + "completion_length": 1442.5000610351562, + "epoch": 0.8785060975609756, + "grad_norm": 0.07730484792840393, + "kl": 0.0440673828125, + "learning_rate": 1.3288194703855745e-07, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5763 + }, + { + "completion_length": 943.3333435058594, + "epoch": 0.8786585365853659, + "grad_norm": 1.8059967028798687, + "kl": 0.06658935546875, + "learning_rate": 1.3255369813447572e-07, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5764 + }, + { + "completion_length": 963.3333740234375, + "epoch": 0.8788109756097561, + "grad_norm": 0.12899179670888866, + "kl": 0.0562744140625, + "learning_rate": 1.322258364143194e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5765 + }, + { + "completion_length": 2005.5001220703125, + "epoch": 0.8789634146341463, + "grad_norm": 0.12955471212302247, + "kl": 0.0679931640625, + "learning_rate": 1.3189836197092043e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5766 + }, + { + "completion_length": 1138.6666870117188, + "epoch": 0.8791158536585366, + "grad_norm": 0.0784702783094252, + "kl": 0.04052734375, + "learning_rate": 1.3157127489700177e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5767 + }, + { + "completion_length": 2263.166748046875, + "epoch": 0.8792682926829268, + "grad_norm": 0.842767083278039, + "kl": 0.1328125, + "learning_rate": 1.3124457528517503e-07, + "loss": 0.0053, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5768 + }, + { + "completion_length": 2706.0001220703125, + "epoch": 0.879420731707317, + "grad_norm": 1.3159410249221069, + "kl": 0.0521240234375, + "learning_rate": 1.3091826322794388e-07, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5769 + }, + { + "completion_length": 3421.166748046875, + "epoch": 0.8795731707317073, + "grad_norm": 0.9039884802140594, + "kl": 0.0394287109375, + "learning_rate": 1.3059233881770138e-07, + "loss": 0.0016, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5770 + }, + { + "completion_length": 735.3333435058594, + "epoch": 0.8797256097560976, + "grad_norm": 0.10535517570265475, + "kl": 0.05224609375, + "learning_rate": 1.3026680214673142e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5771 + }, + { + "completion_length": 1307.3333740234375, + "epoch": 0.8798780487804878, + "grad_norm": 0.14510470819586077, + "kl": 0.077880859375, + "learning_rate": 1.2994165330720675e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5772 + }, + { + "completion_length": 1122.0000610351562, + "epoch": 0.880030487804878, + "grad_norm": 0.162633249919288, + "kl": 0.08740234375, + "learning_rate": 1.2961689239119208e-07, + "loss": 0.0035, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5773 + }, + { + "completion_length": 1970.1666870117188, + "epoch": 0.8801829268292682, + "grad_norm": 0.070871826040898, + "kl": 0.050048828125, + "learning_rate": 1.2929251949064097e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5774 + }, + { + "completion_length": 810.1666870117188, + "epoch": 0.8803353658536586, + "grad_norm": 0.11828338599779353, + "kl": 0.0582275390625, + "learning_rate": 1.2896853469739783e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5775 + }, + { + "completion_length": 692.1666870117188, + "epoch": 0.8804878048780488, + "grad_norm": 1.954934687016129, + "kl": 0.069091796875, + "learning_rate": 1.2864493810319676e-07, + "loss": 0.0028, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5776 + }, + { + "completion_length": 1066.6667175292969, + "epoch": 0.880640243902439, + "grad_norm": 0.09439892568868938, + "kl": 0.057861328125, + "learning_rate": 1.2832172979966234e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5777 + }, + { + "completion_length": 1147.0000610351562, + "epoch": 0.8807926829268292, + "grad_norm": 0.09342549228535149, + "kl": 0.0460205078125, + "learning_rate": 1.2799890987830897e-07, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5778 + }, + { + "completion_length": 1192.0000610351562, + "epoch": 0.8809451219512195, + "grad_norm": 0.11029421203808477, + "kl": 0.061279296875, + "learning_rate": 1.2767647843054092e-07, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5779 + }, + { + "completion_length": 1342.0, + "epoch": 0.8810975609756098, + "grad_norm": 0.1807108564544196, + "kl": 0.06298828125, + "learning_rate": 1.2735443554765313e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5780 + }, + { + "completion_length": 1077.1667175292969, + "epoch": 0.88125, + "grad_norm": 0.3893267526250358, + "kl": 0.076171875, + "learning_rate": 1.2703278132082934e-07, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5781 + }, + { + "completion_length": 1840.5, + "epoch": 0.8814024390243902, + "grad_norm": 0.059539428779291276, + "kl": 0.0531005859375, + "learning_rate": 1.2671151584114437e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5782 + }, + { + "completion_length": 1114.3333740234375, + "epoch": 0.8815548780487805, + "grad_norm": 0.08624812533592699, + "kl": 0.0535888671875, + "learning_rate": 1.2639063919956228e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5783 + }, + { + "completion_length": 1518.3333435058594, + "epoch": 0.8817073170731707, + "grad_norm": 0.08922840284834747, + "kl": 0.061767578125, + "learning_rate": 1.260701514869379e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5784 + }, + { + "completion_length": 1488.0000610351562, + "epoch": 0.881859756097561, + "grad_norm": 0.09709956246006263, + "kl": 0.0567626953125, + "learning_rate": 1.2575005279401464e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5785 + }, + { + "completion_length": 1507.8333435058594, + "epoch": 0.8820121951219512, + "grad_norm": 1.1844214472127974, + "kl": 0.04443359375, + "learning_rate": 1.2543034321142666e-07, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5786 + }, + { + "completion_length": 2040.666748046875, + "epoch": 0.8821646341463415, + "grad_norm": 0.31808584651933086, + "kl": 0.0616455078125, + "learning_rate": 1.2511102282969788e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5787 + }, + { + "completion_length": 1368.6666870117188, + "epoch": 0.8823170731707317, + "grad_norm": 2.440952499271268, + "kl": 0.068603515625, + "learning_rate": 1.2479209173924182e-07, + "loss": 0.0028, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5788 + }, + { + "completion_length": 1410.6667175292969, + "epoch": 0.8824695121951219, + "grad_norm": 0.12542546922611114, + "kl": 0.065673828125, + "learning_rate": 1.244735500303621e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5789 + }, + { + "completion_length": 1727.5000610351562, + "epoch": 0.8826219512195121, + "grad_norm": 0.11695060281595555, + "kl": 0.064208984375, + "learning_rate": 1.241553977932512e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5790 + }, + { + "completion_length": 771.5000305175781, + "epoch": 0.8827743902439025, + "grad_norm": 0.12386992231736306, + "kl": 0.03704833984375, + "learning_rate": 1.238376351179924e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5791 + }, + { + "completion_length": 2017.0, + "epoch": 0.8829268292682927, + "grad_norm": 0.08144472606533906, + "kl": 0.0518798828125, + "learning_rate": 1.2352026209455808e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5792 + }, + { + "completion_length": 1422.0, + "epoch": 0.8830792682926829, + "grad_norm": 0.10767896520824088, + "kl": 0.0438232421875, + "learning_rate": 1.232032788128105e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5793 + }, + { + "completion_length": 2175.0001220703125, + "epoch": 0.8832317073170731, + "grad_norm": 0.07917637834748627, + "kl": 0.0557861328125, + "learning_rate": 1.2288668536250175e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5794 + }, + { + "completion_length": 1646.6666870117188, + "epoch": 0.8833841463414634, + "grad_norm": 2.2024174718086624, + "kl": 0.064208984375, + "learning_rate": 1.2257048183327257e-07, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5795 + }, + { + "completion_length": 1841.6666870117188, + "epoch": 0.8835365853658537, + "grad_norm": 0.11252176160039115, + "kl": 0.0830078125, + "learning_rate": 1.2225466831465486e-07, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5796 + }, + { + "completion_length": 966.6667175292969, + "epoch": 0.8836890243902439, + "grad_norm": 0.13245946364118985, + "kl": 0.0570068359375, + "learning_rate": 1.2193924489606877e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5797 + }, + { + "completion_length": 1054.0000610351562, + "epoch": 0.8838414634146341, + "grad_norm": 0.08408719692881264, + "kl": 0.056640625, + "learning_rate": 1.2162421166682486e-07, + "loss": 0.0023, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 5798 + }, + { + "completion_length": 1667.3333740234375, + "epoch": 0.8839939024390244, + "grad_norm": 1.6689978752991868, + "kl": 0.059814453125, + "learning_rate": 1.2130956871612243e-07, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5799 + }, + { + "completion_length": 951.0, + "epoch": 0.8841463414634146, + "grad_norm": 0.09859357483222539, + "kl": 0.0535888671875, + "learning_rate": 1.209953161330507e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5800 + }, + { + "completion_length": 1489.5000610351562, + "epoch": 0.8842987804878049, + "grad_norm": 0.126441874572652, + "kl": 0.0556640625, + "learning_rate": 1.2068145400658854e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5801 + }, + { + "completion_length": 1959.3333740234375, + "epoch": 0.8844512195121951, + "grad_norm": 0.13849971341478343, + "kl": 0.0557861328125, + "learning_rate": 1.2036798242560414e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5802 + }, + { + "completion_length": 1410.5, + "epoch": 0.8846036585365854, + "grad_norm": 0.06133427572870618, + "kl": 0.03631591796875, + "learning_rate": 1.2005490147885473e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5803 + }, + { + "completion_length": 1472.3333740234375, + "epoch": 0.8847560975609756, + "grad_norm": 0.06962912537253398, + "kl": 0.051513671875, + "learning_rate": 1.1974221125498734e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5804 + }, + { + "completion_length": 1118.3333435058594, + "epoch": 0.8849085365853658, + "grad_norm": 0.2441742661328426, + "kl": 0.0628662109375, + "learning_rate": 1.1942991184253838e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5805 + }, + { + "completion_length": 1143.8333740234375, + "epoch": 0.885060975609756, + "grad_norm": 0.1444052797309725, + "kl": 0.0615234375, + "learning_rate": 1.191180033299334e-07, + "loss": 0.0025, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 5806 + }, + { + "completion_length": 1132.1667175292969, + "epoch": 0.8852134146341464, + "grad_norm": 0.13376094840934644, + "kl": 0.073486328125, + "learning_rate": 1.1880648580548758e-07, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5807 + }, + { + "completion_length": 1092.0000610351562, + "epoch": 0.8853658536585366, + "grad_norm": 0.08441573750788324, + "kl": 0.0465087890625, + "learning_rate": 1.1849535935740474e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5808 + }, + { + "completion_length": 2200.8333740234375, + "epoch": 0.8855182926829268, + "grad_norm": 0.11028423392770119, + "kl": 0.056396484375, + "learning_rate": 1.1818462407377872e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5809 + }, + { + "completion_length": 1411.8333435058594, + "epoch": 0.885670731707317, + "grad_norm": 0.07148793214818666, + "kl": 0.047119140625, + "learning_rate": 1.1787428004259199e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5810 + }, + { + "completion_length": 1297.0000610351562, + "epoch": 0.8858231707317074, + "grad_norm": 0.09837410717226718, + "kl": 0.066650390625, + "learning_rate": 1.1756432735171691e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5811 + }, + { + "completion_length": 892.1666870117188, + "epoch": 0.8859756097560976, + "grad_norm": 0.10318865595181458, + "kl": 0.0543212890625, + "learning_rate": 1.1725476608891478e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5812 + }, + { + "completion_length": 1027.1666870117188, + "epoch": 0.8861280487804878, + "grad_norm": 0.1228552233355344, + "kl": 0.048583984375, + "learning_rate": 1.1694559634183499e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5813 + }, + { + "completion_length": 1078.0000305175781, + "epoch": 0.886280487804878, + "grad_norm": 0.1083074847892813, + "kl": 0.0599365234375, + "learning_rate": 1.1663681819801803e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5814 + }, + { + "completion_length": 1162.3333740234375, + "epoch": 0.8864329268292683, + "grad_norm": 1.7498611473889956, + "kl": 0.0546875, + "learning_rate": 1.1632843174489244e-07, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5815 + }, + { + "completion_length": 841.8333435058594, + "epoch": 0.8865853658536585, + "grad_norm": 0.07859269587300906, + "kl": 0.044921875, + "learning_rate": 1.1602043706977538e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5816 + }, + { + "completion_length": 849.8333740234375, + "epoch": 0.8867378048780488, + "grad_norm": 0.09858408557947954, + "kl": 0.0369873046875, + "learning_rate": 1.1571283425987378e-07, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5817 + }, + { + "completion_length": 1179.8333740234375, + "epoch": 0.886890243902439, + "grad_norm": 0.2190927603986992, + "kl": 0.0474853515625, + "learning_rate": 1.1540562340228378e-07, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5818 + }, + { + "completion_length": 1008.0000305175781, + "epoch": 0.8870426829268293, + "grad_norm": 0.15409575210843413, + "kl": 0.07763671875, + "learning_rate": 1.1509880458399014e-07, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5819 + }, + { + "completion_length": 2077.5000610351562, + "epoch": 0.8871951219512195, + "grad_norm": 0.08225667818838296, + "kl": 0.0609130859375, + "learning_rate": 1.147923778918667e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5820 + }, + { + "completion_length": 1476.5000610351562, + "epoch": 0.8873475609756097, + "grad_norm": 0.14329646436121785, + "kl": 0.073974609375, + "learning_rate": 1.1448634341267589e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5821 + }, + { + "completion_length": 1345.3333740234375, + "epoch": 0.8875, + "grad_norm": 0.1962963732445295, + "kl": 0.0657958984375, + "learning_rate": 1.141807012330699e-07, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5822 + }, + { + "completion_length": 930.1666870117188, + "epoch": 0.8876524390243903, + "grad_norm": 2.0736095567659962, + "kl": 0.0751953125, + "learning_rate": 1.1387545143958933e-07, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5823 + }, + { + "completion_length": 1285.8333740234375, + "epoch": 0.8878048780487805, + "grad_norm": 0.11907958750808262, + "kl": 0.054931640625, + "learning_rate": 1.1357059411866355e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5824 + }, + { + "completion_length": 998.8333435058594, + "epoch": 0.8879573170731707, + "grad_norm": 1.6610723064791866, + "kl": 0.0582275390625, + "learning_rate": 1.1326612935661152e-07, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5825 + }, + { + "completion_length": 937.8333740234375, + "epoch": 0.8881097560975609, + "grad_norm": 0.08585804793608863, + "kl": 0.048095703125, + "learning_rate": 1.1296205723963993e-07, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5826 + }, + { + "completion_length": 1542.5, + "epoch": 0.8882621951219513, + "grad_norm": 0.07907413846815219, + "kl": 0.0506591796875, + "learning_rate": 1.1265837785384526e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5827 + }, + { + "completion_length": 1777.5000610351562, + "epoch": 0.8884146341463415, + "grad_norm": 0.07205653703004844, + "kl": 0.0579833984375, + "learning_rate": 1.1235509128521221e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5828 + }, + { + "completion_length": 1120.0000610351562, + "epoch": 0.8885670731707317, + "grad_norm": 0.08448348129106704, + "kl": 0.039794921875, + "learning_rate": 1.1205219761961477e-07, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5829 + }, + { + "completion_length": 1943.0, + "epoch": 0.8887195121951219, + "grad_norm": 0.10598313640510472, + "kl": 0.051513671875, + "learning_rate": 1.117496969428155e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5830 + }, + { + "completion_length": 1898.0, + "epoch": 0.8888719512195122, + "grad_norm": 0.05658221250508569, + "kl": 0.0421142578125, + "learning_rate": 1.1144758934046522e-07, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5831 + }, + { + "completion_length": 2074.3334350585938, + "epoch": 0.8890243902439025, + "grad_norm": 1.564586156261203, + "kl": 0.0570068359375, + "learning_rate": 1.1114587489810352e-07, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5832 + }, + { + "completion_length": 1324.1666870117188, + "epoch": 0.8891768292682927, + "grad_norm": 0.10191104499039308, + "kl": 0.054443359375, + "learning_rate": 1.1084455370116003e-07, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5833 + }, + { + "completion_length": 864.3333435058594, + "epoch": 0.8893292682926829, + "grad_norm": 0.08521495679527034, + "kl": 0.0462646484375, + "learning_rate": 1.105436258349512e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5834 + }, + { + "completion_length": 1091.8333740234375, + "epoch": 0.8894817073170732, + "grad_norm": 0.10729026919288527, + "kl": 0.074462890625, + "learning_rate": 1.1024309138468286e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5835 + }, + { + "completion_length": 680.1666870117188, + "epoch": 0.8896341463414634, + "grad_norm": 0.507229606736407, + "kl": 0.065185546875, + "learning_rate": 1.0994295043544978e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5836 + }, + { + "completion_length": 1930.0000610351562, + "epoch": 0.8897865853658536, + "grad_norm": 1.8075648018109005, + "kl": 0.053955078125, + "learning_rate": 1.096432030722348e-07, + "loss": 0.0022, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5837 + }, + { + "completion_length": 2459.5000610351562, + "epoch": 0.8899390243902439, + "grad_norm": 0.07360080023337927, + "kl": 0.040771484375, + "learning_rate": 1.0934384937990988e-07, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5838 + }, + { + "completion_length": 1752.5000610351562, + "epoch": 0.8900914634146342, + "grad_norm": 0.10140749361971085, + "kl": 0.0650634765625, + "learning_rate": 1.090448894432347e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5839 + }, + { + "completion_length": 1071.0000305175781, + "epoch": 0.8902439024390244, + "grad_norm": 0.11125639870561958, + "kl": 0.061767578125, + "learning_rate": 1.0874632334685808e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5840 + }, + { + "completion_length": 810.3333435058594, + "epoch": 0.8903963414634146, + "grad_norm": 0.09168938635331832, + "kl": 0.0594482421875, + "learning_rate": 1.084481511753172e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5841 + }, + { + "completion_length": 1345.5, + "epoch": 0.8905487804878048, + "grad_norm": 0.12767398187654935, + "kl": 0.0628662109375, + "learning_rate": 1.0815037301303755e-07, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5842 + }, + { + "completion_length": 1730.6667175292969, + "epoch": 0.8907012195121952, + "grad_norm": 0.06849952223694837, + "kl": 0.0460205078125, + "learning_rate": 1.0785298894433349e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5843 + }, + { + "completion_length": 1840.3333740234375, + "epoch": 0.8908536585365854, + "grad_norm": 0.11761177306981228, + "kl": 0.064453125, + "learning_rate": 1.0755599905340701e-07, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5844 + }, + { + "completion_length": 1784.5000915527344, + "epoch": 0.8910060975609756, + "grad_norm": 0.09447728866401302, + "kl": 0.0489501953125, + "learning_rate": 1.072594034243492e-07, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5845 + }, + { + "completion_length": 865.5000305175781, + "epoch": 0.8911585365853658, + "grad_norm": 0.2567366804439559, + "kl": 0.0552978515625, + "learning_rate": 1.0696320214113919e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5846 + }, + { + "completion_length": 1594.3334350585938, + "epoch": 0.8913109756097561, + "grad_norm": 0.08382299021422171, + "kl": 0.0533447265625, + "learning_rate": 1.0666739528764474e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5847 + }, + { + "completion_length": 2053.5000610351562, + "epoch": 0.8914634146341464, + "grad_norm": 0.05939067776154364, + "kl": 0.0430908203125, + "learning_rate": 1.0637198294762152e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5848 + }, + { + "completion_length": 1334.8333740234375, + "epoch": 0.8916158536585366, + "grad_norm": 0.06633395776802316, + "kl": 0.0452880859375, + "learning_rate": 1.060769652047136e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5849 + }, + { + "completion_length": 2612.6666870117188, + "epoch": 0.8917682926829268, + "grad_norm": 0.04357471817831397, + "kl": 0.0439453125, + "learning_rate": 1.0578234214245336e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5850 + }, + { + "completion_length": 653.1666717529297, + "epoch": 0.8919207317073171, + "grad_norm": 0.15733705772020118, + "kl": 0.052490234375, + "learning_rate": 1.0548811384426221e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5851 + }, + { + "completion_length": 1224.0000610351562, + "epoch": 0.8920731707317073, + "grad_norm": 0.0993476758341599, + "kl": 0.0662841796875, + "learning_rate": 1.0519428039344836e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5852 + }, + { + "completion_length": 1777.1666870117188, + "epoch": 0.8922256097560975, + "grad_norm": 0.1071134743028472, + "kl": 0.05322265625, + "learning_rate": 1.0490084187320908e-07, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5853 + }, + { + "completion_length": 1902.0000610351562, + "epoch": 0.8923780487804878, + "grad_norm": 0.07303607600031456, + "kl": 0.05615234375, + "learning_rate": 1.046077983666296e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5854 + }, + { + "completion_length": 2339.3333740234375, + "epoch": 0.8925304878048781, + "grad_norm": 0.06523224701214234, + "kl": 0.0543212890625, + "learning_rate": 1.0431514995668351e-07, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5855 + }, + { + "completion_length": 905.3333740234375, + "epoch": 0.8926829268292683, + "grad_norm": 0.14832304127937346, + "kl": 0.070068359375, + "learning_rate": 1.0402289672623272e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5856 + }, + { + "completion_length": 1645.3333740234375, + "epoch": 0.8928353658536585, + "grad_norm": 0.08768341537983786, + "kl": 0.06103515625, + "learning_rate": 1.0373103875802619e-07, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5857 + }, + { + "completion_length": 1294.8333435058594, + "epoch": 0.8929878048780487, + "grad_norm": 0.08948587253092673, + "kl": 0.060791015625, + "learning_rate": 1.0343957613470201e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5858 + }, + { + "completion_length": 2390.5001220703125, + "epoch": 0.8931402439024391, + "grad_norm": 0.05954053963339573, + "kl": 0.0518798828125, + "learning_rate": 1.0314850893878613e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5859 + }, + { + "completion_length": 2235.3333435058594, + "epoch": 0.8932926829268293, + "grad_norm": 0.1898947990663558, + "kl": 0.0665283203125, + "learning_rate": 1.0285783725269232e-07, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5860 + }, + { + "completion_length": 1183.5, + "epoch": 0.8934451219512195, + "grad_norm": 0.1871155113437171, + "kl": 0.0594482421875, + "learning_rate": 1.0256756115872273e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5861 + }, + { + "completion_length": 1323.1666870117188, + "epoch": 0.8935975609756097, + "grad_norm": 0.06814288000033812, + "kl": 0.0516357421875, + "learning_rate": 1.0227768073906662e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5862 + }, + { + "completion_length": 2396.1666870117188, + "epoch": 0.89375, + "grad_norm": 0.0595427058588961, + "kl": 0.0416259765625, + "learning_rate": 1.0198819607580234e-07, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5863 + }, + { + "completion_length": 2818.666748046875, + "epoch": 0.8939024390243903, + "grad_norm": 0.045093189581088676, + "kl": 0.0447998046875, + "learning_rate": 1.0169910725089548e-07, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5864 + }, + { + "completion_length": 2405.5000610351562, + "epoch": 0.8940548780487805, + "grad_norm": 0.07147434759177493, + "kl": 0.05224609375, + "learning_rate": 1.0141041434619991e-07, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5865 + }, + { + "completion_length": 1519.8333740234375, + "epoch": 0.8942073170731707, + "grad_norm": 0.2692884719727374, + "kl": 0.0589599609375, + "learning_rate": 1.0112211744345706e-07, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5866 + }, + { + "completion_length": 1632.5000610351562, + "epoch": 0.894359756097561, + "grad_norm": 0.11921759366543884, + "kl": 0.0706787109375, + "learning_rate": 1.0083421662429632e-07, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5867 + }, + { + "completion_length": 1134.3333740234375, + "epoch": 0.8945121951219512, + "grad_norm": 0.1137601428192594, + "kl": 0.0758056640625, + "learning_rate": 1.005467119702353e-07, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5868 + }, + { + "completion_length": 1750.5000610351562, + "epoch": 0.8946646341463415, + "grad_norm": 0.09242472436163926, + "kl": 0.0572509765625, + "learning_rate": 1.0025960356267888e-07, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5869 + }, + { + "completion_length": 1274.3333740234375, + "epoch": 0.8948170731707317, + "grad_norm": 0.28980600033389076, + "kl": 0.0758056640625, + "learning_rate": 9.997289148292021e-08, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5870 + }, + { + "completion_length": 1668.0, + "epoch": 0.894969512195122, + "grad_norm": 1.1387164140994188, + "kl": 0.053466796875, + "learning_rate": 9.968657581214003e-08, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5871 + }, + { + "completion_length": 2171.5000915527344, + "epoch": 0.8951219512195122, + "grad_norm": 0.7414874977642637, + "kl": 0.03564453125, + "learning_rate": 9.940065663140663e-08, + "loss": 0.0014, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5872 + }, + { + "completion_length": 824.8333435058594, + "epoch": 0.8952743902439024, + "grad_norm": 0.09299583161226777, + "kl": 0.0616455078125, + "learning_rate": 9.91151340216766e-08, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5873 + }, + { + "completion_length": 2086.166748046875, + "epoch": 0.8954268292682926, + "grad_norm": 0.0626390265251743, + "kl": 0.0386962890625, + "learning_rate": 9.883000806379378e-08, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5874 + }, + { + "completion_length": 1039.8333740234375, + "epoch": 0.895579268292683, + "grad_norm": 0.11779892418355184, + "kl": 0.05224609375, + "learning_rate": 9.854527883848974e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5875 + }, + { + "completion_length": 1994.5000610351562, + "epoch": 0.8957317073170732, + "grad_norm": 0.049671452548757405, + "kl": 0.03765869140625, + "learning_rate": 9.82609464263835e-08, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5876 + }, + { + "completion_length": 1134.1666870117188, + "epoch": 0.8958841463414634, + "grad_norm": 0.13074604485257252, + "kl": 0.04931640625, + "learning_rate": 9.79770109079825e-08, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5877 + }, + { + "completion_length": 1380.0000610351562, + "epoch": 0.8960365853658536, + "grad_norm": 0.09651070323671854, + "kl": 0.0531005859375, + "learning_rate": 9.769347236368126e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5878 + }, + { + "completion_length": 2566.83349609375, + "epoch": 0.896189024390244, + "grad_norm": 0.05170562036615942, + "kl": 0.0458984375, + "learning_rate": 9.741033087376172e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5879 + }, + { + "completion_length": 967.3333740234375, + "epoch": 0.8963414634146342, + "grad_norm": 0.1730174990451042, + "kl": 0.0452880859375, + "learning_rate": 9.71275865183936e-08, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5880 + }, + { + "completion_length": 1810.8334350585938, + "epoch": 0.8964939024390244, + "grad_norm": 0.09668903032916096, + "kl": 0.0653076171875, + "learning_rate": 9.684523937763451e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5881 + }, + { + "completion_length": 1602.6666870117188, + "epoch": 0.8966463414634146, + "grad_norm": 0.17723913736967298, + "kl": 0.072265625, + "learning_rate": 9.6563289531429e-08, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5882 + }, + { + "completion_length": 2390.666748046875, + "epoch": 0.8967987804878049, + "grad_norm": 1.1319834584845094, + "kl": 0.037841796875, + "learning_rate": 9.628173705960975e-08, + "loss": 0.0015, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5883 + }, + { + "completion_length": 682.6666870117188, + "epoch": 0.8969512195121951, + "grad_norm": 0.14062576553679693, + "kl": 0.093017578125, + "learning_rate": 9.600058204189627e-08, + "loss": 0.0037, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5884 + }, + { + "completion_length": 1110.8333740234375, + "epoch": 0.8971036585365854, + "grad_norm": 1.6684628085469497, + "kl": 0.065673828125, + "learning_rate": 9.571982455789575e-08, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5885 + }, + { + "completion_length": 1179.0000305175781, + "epoch": 0.8972560975609756, + "grad_norm": 0.1730974501152915, + "kl": 0.071044921875, + "learning_rate": 9.54394646871034e-08, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5886 + }, + { + "completion_length": 1494.666748046875, + "epoch": 0.8974085365853659, + "grad_norm": 0.06578168693917047, + "kl": 0.0604248046875, + "learning_rate": 9.515950250890093e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5887 + }, + { + "completion_length": 1111.5000610351562, + "epoch": 0.8975609756097561, + "grad_norm": 0.0934898583552302, + "kl": 0.0673828125, + "learning_rate": 9.487993810255823e-08, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5888 + }, + { + "completion_length": 1609.3333740234375, + "epoch": 0.8977134146341463, + "grad_norm": 0.12314156888941426, + "kl": 0.04638671875, + "learning_rate": 9.4600771547232e-08, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5889 + }, + { + "completion_length": 1265.3333740234375, + "epoch": 0.8978658536585366, + "grad_norm": 0.15440351140375239, + "kl": 0.06396484375, + "learning_rate": 9.432200292196669e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5890 + }, + { + "completion_length": 1937.5000610351562, + "epoch": 0.8980182926829269, + "grad_norm": 0.10588826757976366, + "kl": 0.0594482421875, + "learning_rate": 9.404363230569363e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5891 + }, + { + "completion_length": 898.3333435058594, + "epoch": 0.8981707317073171, + "grad_norm": 0.09084120445477413, + "kl": 0.058837890625, + "learning_rate": 9.376565977723229e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5892 + }, + { + "completion_length": 1326.1666870117188, + "epoch": 0.8983231707317073, + "grad_norm": 0.2955966910614097, + "kl": 0.07177734375, + "learning_rate": 9.34880854152882e-08, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5893 + }, + { + "completion_length": 1542.3333740234375, + "epoch": 0.8984756097560975, + "grad_norm": 0.06401439290997528, + "kl": 0.0518798828125, + "learning_rate": 9.321090929845516e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5894 + }, + { + "completion_length": 1681.166748046875, + "epoch": 0.8986280487804879, + "grad_norm": 0.05976544197157206, + "kl": 0.047607421875, + "learning_rate": 9.293413150521374e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5895 + }, + { + "completion_length": 2099.1666870117188, + "epoch": 0.8987804878048781, + "grad_norm": 0.05114677394679478, + "kl": 0.0460205078125, + "learning_rate": 9.265775211393224e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5896 + }, + { + "completion_length": 947.5, + "epoch": 0.8989329268292683, + "grad_norm": 0.12853500668554077, + "kl": 0.0323486328125, + "learning_rate": 9.238177120286523e-08, + "loss": 0.0013, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5897 + }, + { + "completion_length": 816.6666870117188, + "epoch": 0.8990853658536585, + "grad_norm": 0.10581135416756945, + "kl": 0.06396484375, + "learning_rate": 9.210618885015504e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5898 + }, + { + "completion_length": 966.1666870117188, + "epoch": 0.8992378048780488, + "grad_norm": 0.09930830350312307, + "kl": 0.0638427734375, + "learning_rate": 9.183100513383142e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5899 + }, + { + "completion_length": 750.3333435058594, + "epoch": 0.899390243902439, + "grad_norm": 0.11065768230679822, + "kl": 0.0487060546875, + "learning_rate": 9.15562201318107e-08, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5900 + }, + { + "completion_length": 1480.3333740234375, + "epoch": 0.8995426829268293, + "grad_norm": 0.11007762656343596, + "kl": 0.053955078125, + "learning_rate": 9.128183392189699e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5901 + }, + { + "completion_length": 1431.1666870117188, + "epoch": 0.8996951219512195, + "grad_norm": 0.1385873704087267, + "kl": 0.0758056640625, + "learning_rate": 9.100784658178029e-08, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5902 + }, + { + "completion_length": 1289.8333740234375, + "epoch": 0.8998475609756098, + "grad_norm": 0.10868026963624028, + "kl": 0.0693359375, + "learning_rate": 9.073425818903891e-08, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5903 + }, + { + "completion_length": 1136.8333435058594, + "epoch": 0.9, + "grad_norm": 0.13197921685941535, + "kl": 0.060791015625, + "learning_rate": 9.046106882113752e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5904 + }, + { + "completion_length": 1049.6667175292969, + "epoch": 0.9001524390243902, + "grad_norm": 0.18241437373654013, + "kl": 0.071533203125, + "learning_rate": 9.018827855542811e-08, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5905 + }, + { + "completion_length": 1623.0000610351562, + "epoch": 0.9003048780487805, + "grad_norm": 0.112703199501803, + "kl": 0.0491943359375, + "learning_rate": 8.991588746914952e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5906 + }, + { + "completion_length": 1661.666748046875, + "epoch": 0.9004573170731708, + "grad_norm": 1.2503636425898164, + "kl": 0.0465087890625, + "learning_rate": 8.964389563942743e-08, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5907 + }, + { + "completion_length": 2324.0001220703125, + "epoch": 0.900609756097561, + "grad_norm": 0.08482509060700731, + "kl": 0.058349609375, + "learning_rate": 8.937230314327504e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5908 + }, + { + "completion_length": 2145.6666870117188, + "epoch": 0.9007621951219512, + "grad_norm": 1.0391957646118155, + "kl": 0.0460205078125, + "learning_rate": 8.910111005759186e-08, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5909 + }, + { + "completion_length": 993.3333435058594, + "epoch": 0.9009146341463414, + "grad_norm": 0.11082475482856644, + "kl": 0.08349609375, + "learning_rate": 8.883031645916428e-08, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5910 + }, + { + "completion_length": 1768.3333740234375, + "epoch": 0.9010670731707318, + "grad_norm": 0.12628370438741396, + "kl": 0.0782470703125, + "learning_rate": 8.855992242466615e-08, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5911 + }, + { + "completion_length": 3158.166748046875, + "epoch": 0.901219512195122, + "grad_norm": 1.5585794271268654, + "kl": 0.0506591796875, + "learning_rate": 8.828992803065772e-08, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5912 + }, + { + "completion_length": 1973.3333740234375, + "epoch": 0.9013719512195122, + "grad_norm": 0.09968427519646653, + "kl": 0.0653076171875, + "learning_rate": 8.802033335358617e-08, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 5913 + }, + { + "completion_length": 1282.5000610351562, + "epoch": 0.9015243902439024, + "grad_norm": 0.08679285533281787, + "kl": 0.0645751953125, + "learning_rate": 8.775113846978594e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5914 + }, + { + "completion_length": 1585.6666870117188, + "epoch": 0.9016768292682927, + "grad_norm": 0.07927990458809561, + "kl": 0.05157470703125, + "learning_rate": 8.748234345547723e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5915 + }, + { + "completion_length": 1164.0000305175781, + "epoch": 0.901829268292683, + "grad_norm": 0.07103021628759225, + "kl": 0.0362548828125, + "learning_rate": 8.721394838676816e-08, + "loss": 0.0014, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5916 + }, + { + "completion_length": 1223.6666870117188, + "epoch": 0.9019817073170732, + "grad_norm": 0.09032507032604414, + "kl": 0.0479736328125, + "learning_rate": 8.694595333965311e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5917 + }, + { + "completion_length": 1112.1666870117188, + "epoch": 0.9021341463414634, + "grad_norm": 0.08079706853963378, + "kl": 0.0548095703125, + "learning_rate": 8.667835839001287e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5918 + }, + { + "completion_length": 1809.666748046875, + "epoch": 0.9022865853658537, + "grad_norm": 0.10191470395526885, + "kl": 0.0533447265625, + "learning_rate": 8.641116361361584e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5919 + }, + { + "completion_length": 1427.1666870117188, + "epoch": 0.9024390243902439, + "grad_norm": 1.6427052791268524, + "kl": 0.0535888671875, + "learning_rate": 8.614436908611617e-08, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5920 + }, + { + "completion_length": 1006.5, + "epoch": 0.9025914634146341, + "grad_norm": 0.11845561452953583, + "kl": 0.0755615234375, + "learning_rate": 8.587797488305494e-08, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5921 + }, + { + "completion_length": 1162.1666870117188, + "epoch": 0.9027439024390244, + "grad_norm": 0.09711295270315214, + "kl": 0.07080078125, + "learning_rate": 8.561198107986046e-08, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5922 + }, + { + "completion_length": 1102.5000305175781, + "epoch": 0.9028963414634147, + "grad_norm": 0.10125987570332345, + "kl": 0.0638427734375, + "learning_rate": 8.534638775184683e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5923 + }, + { + "completion_length": 1221.1666870117188, + "epoch": 0.9030487804878049, + "grad_norm": 0.11409386185061875, + "kl": 0.0604248046875, + "learning_rate": 8.508119497421524e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5924 + }, + { + "completion_length": 2171.6666870117188, + "epoch": 0.9032012195121951, + "grad_norm": 1.3120808207821635, + "kl": 0.0548095703125, + "learning_rate": 8.48164028220536e-08, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5925 + }, + { + "completion_length": 2123.8333435058594, + "epoch": 0.9033536585365853, + "grad_norm": 0.0606288911017007, + "kl": 0.0487060546875, + "learning_rate": 8.455201137033598e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5926 + }, + { + "completion_length": 1728.1667175292969, + "epoch": 0.9035060975609757, + "grad_norm": 0.19702274450798762, + "kl": 0.0809326171875, + "learning_rate": 8.428802069392327e-08, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5927 + }, + { + "completion_length": 1314.666748046875, + "epoch": 0.9036585365853659, + "grad_norm": 0.07396465138884603, + "kl": 0.04541015625, + "learning_rate": 8.402443086756273e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5928 + }, + { + "completion_length": 1251.0000610351562, + "epoch": 0.9038109756097561, + "grad_norm": 0.12087007282207977, + "kl": 0.060791015625, + "learning_rate": 8.376124196588797e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5929 + }, + { + "completion_length": 1486.3333740234375, + "epoch": 0.9039634146341463, + "grad_norm": 0.0626306755354395, + "kl": 0.049072265625, + "learning_rate": 8.349845406341954e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5930 + }, + { + "completion_length": 2030.8333740234375, + "epoch": 0.9041158536585366, + "grad_norm": 0.1130081942186061, + "kl": 0.06201171875, + "learning_rate": 8.323606723456428e-08, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5931 + }, + { + "completion_length": 1290.6666870117188, + "epoch": 0.9042682926829269, + "grad_norm": 0.09373422427833533, + "kl": 0.046630859375, + "learning_rate": 8.297408155361542e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5932 + }, + { + "completion_length": 873.6666870117188, + "epoch": 0.9044207317073171, + "grad_norm": 0.08591422594010961, + "kl": 0.0447998046875, + "learning_rate": 8.271249709475226e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5933 + }, + { + "completion_length": 1414.3333740234375, + "epoch": 0.9045731707317073, + "grad_norm": 0.06930640397001885, + "kl": 0.0462646484375, + "learning_rate": 8.245131393204092e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5934 + }, + { + "completion_length": 1611.666748046875, + "epoch": 0.9047256097560976, + "grad_norm": 0.08369011858969477, + "kl": 0.0556640625, + "learning_rate": 8.21905321394339e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5935 + }, + { + "completion_length": 1633.166748046875, + "epoch": 0.9048780487804878, + "grad_norm": 0.11150482623631658, + "kl": 0.0517578125, + "learning_rate": 8.193015179076996e-08, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5936 + }, + { + "completion_length": 1116.1666870117188, + "epoch": 0.905030487804878, + "grad_norm": 0.08154942017376349, + "kl": 0.0458984375, + "learning_rate": 8.167017295977414e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5937 + }, + { + "completion_length": 1876.8333740234375, + "epoch": 0.9051829268292683, + "grad_norm": 0.06578097338057604, + "kl": 0.0513916015625, + "learning_rate": 8.14105957200577e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5938 + }, + { + "completion_length": 1365.1666870117188, + "epoch": 0.9053353658536586, + "grad_norm": 0.13605491478151546, + "kl": 0.04638671875, + "learning_rate": 8.115142014511857e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5939 + }, + { + "completion_length": 1837.0000610351562, + "epoch": 0.9054878048780488, + "grad_norm": 0.07983189987063015, + "kl": 0.052978515625, + "learning_rate": 8.089264630834032e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5940 + }, + { + "completion_length": 2227.3333740234375, + "epoch": 0.905640243902439, + "grad_norm": 0.07982709537596411, + "kl": 0.057861328125, + "learning_rate": 8.063427428299386e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5941 + }, + { + "completion_length": 1298.166748046875, + "epoch": 0.9057926829268292, + "grad_norm": 0.12028185648420753, + "kl": 0.05126953125, + "learning_rate": 8.037630414223451e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5942 + }, + { + "completion_length": 2094.5, + "epoch": 0.9059451219512196, + "grad_norm": 0.06387436437695898, + "kl": 0.0572509765625, + "learning_rate": 8.011873595910584e-08, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5943 + }, + { + "completion_length": 2211.0001220703125, + "epoch": 0.9060975609756098, + "grad_norm": 0.07480168480668484, + "kl": 0.059814453125, + "learning_rate": 7.986156980653653e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5944 + }, + { + "completion_length": 1481.8333740234375, + "epoch": 0.90625, + "grad_norm": 0.0657304370848728, + "kl": 0.04559326171875, + "learning_rate": 7.960480575734163e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5945 + }, + { + "completion_length": 1431.1666870117188, + "epoch": 0.9064024390243902, + "grad_norm": 0.09609020741098909, + "kl": 0.055908203125, + "learning_rate": 7.934844388422186e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5946 + }, + { + "completion_length": 1180.5, + "epoch": 0.9065548780487804, + "grad_norm": 0.11034717850295095, + "kl": 0.045166015625, + "learning_rate": 7.909248425976462e-08, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5947 + }, + { + "completion_length": 2727.3333740234375, + "epoch": 0.9067073170731708, + "grad_norm": 0.04781508606356443, + "kl": 0.0382080078125, + "learning_rate": 7.883692695644363e-08, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5948 + }, + { + "completion_length": 727.5, + "epoch": 0.906859756097561, + "grad_norm": 0.12312756555980532, + "kl": 0.07861328125, + "learning_rate": 7.858177204661798e-08, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5949 + }, + { + "completion_length": 1782.5000610351562, + "epoch": 0.9070121951219512, + "grad_norm": 1.2657199236757548, + "kl": 0.048828125, + "learning_rate": 7.832701960253358e-08, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5950 + }, + { + "completion_length": 781.8333740234375, + "epoch": 0.9071646341463414, + "grad_norm": 0.08762832651923998, + "kl": 0.043701171875, + "learning_rate": 7.80726696963217e-08, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5951 + }, + { + "completion_length": 1182.1667175292969, + "epoch": 0.9073170731707317, + "grad_norm": 0.1135939364172797, + "kl": 0.050537109375, + "learning_rate": 7.781872239999993e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5952 + }, + { + "completion_length": 2638.0, + "epoch": 0.907469512195122, + "grad_norm": 1.6562096083995272, + "kl": 0.0545654296875, + "learning_rate": 7.75651777854719e-08, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5953 + }, + { + "completion_length": 1538.8333740234375, + "epoch": 0.9076219512195122, + "grad_norm": 1.1529672372667816, + "kl": 0.05322265625, + "learning_rate": 7.731203592452718e-08, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5954 + }, + { + "completion_length": 864.8333435058594, + "epoch": 0.9077743902439024, + "grad_norm": 0.10903364201754547, + "kl": 0.03668212890625, + "learning_rate": 7.705929688884178e-08, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5955 + }, + { + "completion_length": 956.1666870117188, + "epoch": 0.9079268292682927, + "grad_norm": 0.12102559071141092, + "kl": 0.0518798828125, + "learning_rate": 7.680696074997645e-08, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5956 + }, + { + "completion_length": 1753.5000610351562, + "epoch": 0.9080792682926829, + "grad_norm": 0.10003846934313988, + "kl": 0.056884765625, + "learning_rate": 7.655502757937888e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5957 + }, + { + "completion_length": 2641.0, + "epoch": 0.9082317073170731, + "grad_norm": 0.087670465752013, + "kl": 0.044677734375, + "learning_rate": 7.63034974483825e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5958 + }, + { + "completion_length": 1437.1666870117188, + "epoch": 0.9083841463414634, + "grad_norm": 0.09309644355853351, + "kl": 0.065673828125, + "learning_rate": 7.605237042820667e-08, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5959 + }, + { + "completion_length": 1563.666748046875, + "epoch": 0.9085365853658537, + "grad_norm": 0.17351992118396245, + "kl": 0.06640625, + "learning_rate": 7.580164658995603e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5960 + }, + { + "completion_length": 990.8333435058594, + "epoch": 0.9086890243902439, + "grad_norm": 0.1166064816778485, + "kl": 0.0450439453125, + "learning_rate": 7.555132600462144e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5961 + }, + { + "completion_length": 1579.6666870117188, + "epoch": 0.9088414634146341, + "grad_norm": 0.10209921179962865, + "kl": 0.0628662109375, + "learning_rate": 7.530140874308005e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5962 + }, + { + "completion_length": 747.1666870117188, + "epoch": 0.9089939024390243, + "grad_norm": 0.12542101629027108, + "kl": 0.065673828125, + "learning_rate": 7.505189487609427e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5963 + }, + { + "completion_length": 1334.0000610351562, + "epoch": 0.9091463414634147, + "grad_norm": 0.08418189652812985, + "kl": 0.065673828125, + "learning_rate": 7.480278447431221e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5964 + }, + { + "completion_length": 849.3333740234375, + "epoch": 0.9092987804878049, + "grad_norm": 0.1358278963668856, + "kl": 0.071044921875, + "learning_rate": 7.455407760826799e-08, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5965 + }, + { + "completion_length": 1243.8333740234375, + "epoch": 0.9094512195121951, + "grad_norm": 0.07709303038390569, + "kl": 0.0408935546875, + "learning_rate": 7.43057743483816e-08, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5966 + }, + { + "completion_length": 1634.8333435058594, + "epoch": 0.9096036585365853, + "grad_norm": 0.0788888877575864, + "kl": 0.051025390625, + "learning_rate": 7.40578747649583e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5967 + }, + { + "completion_length": 1752.166748046875, + "epoch": 0.9097560975609756, + "grad_norm": 0.09033638957158191, + "kl": 0.062744140625, + "learning_rate": 7.381037892818959e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5968 + }, + { + "completion_length": 2410.166748046875, + "epoch": 0.9099085365853659, + "grad_norm": 1.0258157809581079, + "kl": 0.04736328125, + "learning_rate": 7.356328690815195e-08, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5969 + }, + { + "completion_length": 1266.8333435058594, + "epoch": 0.9100609756097561, + "grad_norm": 0.07069404999297117, + "kl": 0.03631591796875, + "learning_rate": 7.33165987748084e-08, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5970 + }, + { + "completion_length": 1255.8333740234375, + "epoch": 0.9102134146341463, + "grad_norm": 2.380364630163718, + "kl": 0.0655517578125, + "learning_rate": 7.30703145980069e-08, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5971 + }, + { + "completion_length": 1369.3333435058594, + "epoch": 0.9103658536585366, + "grad_norm": 1.7852727431575122, + "kl": 0.060791015625, + "learning_rate": 7.282443444748149e-08, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5972 + }, + { + "completion_length": 1704.166748046875, + "epoch": 0.9105182926829268, + "grad_norm": 0.07004398604767734, + "kl": 0.06591796875, + "learning_rate": 7.257895839285134e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5973 + }, + { + "completion_length": 1840.8334350585938, + "epoch": 0.910670731707317, + "grad_norm": 0.07393375189164043, + "kl": 0.0372314453125, + "learning_rate": 7.233388650362166e-08, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5974 + }, + { + "completion_length": 1779.5000610351562, + "epoch": 0.9108231707317073, + "grad_norm": 0.17751706505199152, + "kl": 0.053466796875, + "learning_rate": 7.208921884918296e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5975 + }, + { + "completion_length": 1208.8333740234375, + "epoch": 0.9109756097560976, + "grad_norm": 0.10946411636976901, + "kl": 0.07177734375, + "learning_rate": 7.184495549881131e-08, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5976 + }, + { + "completion_length": 872.0000610351562, + "epoch": 0.9111280487804878, + "grad_norm": 0.09418709289901948, + "kl": 0.0501708984375, + "learning_rate": 7.160109652166857e-08, + "loss": 0.002, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 5977 + }, + { + "completion_length": 1543.0, + "epoch": 0.911280487804878, + "grad_norm": 0.06289251069628707, + "kl": 0.053466796875, + "learning_rate": 7.135764198680167e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5978 + }, + { + "completion_length": 1153.8333740234375, + "epoch": 0.9114329268292682, + "grad_norm": 0.17610970472198045, + "kl": 0.07421875, + "learning_rate": 7.11145919631433e-08, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5979 + }, + { + "completion_length": 2090.5001220703125, + "epoch": 0.9115853658536586, + "grad_norm": 0.06899763177270622, + "kl": 0.0457763671875, + "learning_rate": 7.087194651951157e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5980 + }, + { + "completion_length": 1469.0000915527344, + "epoch": 0.9117378048780488, + "grad_norm": 2.6073816442446756, + "kl": 0.0543212890625, + "learning_rate": 7.062970572461036e-08, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5981 + }, + { + "completion_length": 1066.1667175292969, + "epoch": 0.911890243902439, + "grad_norm": 0.13521273719388355, + "kl": 0.04559326171875, + "learning_rate": 7.038786964702815e-08, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5982 + }, + { + "completion_length": 897.8333435058594, + "epoch": 0.9120426829268292, + "grad_norm": 0.09705180648200853, + "kl": 0.0611572265625, + "learning_rate": 7.014643835523949e-08, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5983 + }, + { + "completion_length": 1098.6666870117188, + "epoch": 0.9121951219512195, + "grad_norm": 0.08936027747889741, + "kl": 0.054443359375, + "learning_rate": 6.990541191760418e-08, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 5984 + }, + { + "completion_length": 2027.0, + "epoch": 0.9123475609756098, + "grad_norm": 0.07014023982330425, + "kl": 0.044921875, + "learning_rate": 6.966479040236735e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5985 + }, + { + "completion_length": 1951.3334350585938, + "epoch": 0.9125, + "grad_norm": 0.23769132419623742, + "kl": 0.04534912109375, + "learning_rate": 6.942457387765977e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5986 + }, + { + "completion_length": 950.5, + "epoch": 0.9126524390243902, + "grad_norm": 0.1270165337128513, + "kl": 0.0513916015625, + "learning_rate": 6.918476241149674e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5987 + }, + { + "completion_length": 1560.8333740234375, + "epoch": 0.9128048780487805, + "grad_norm": 0.08297387007327996, + "kl": 0.0665283203125, + "learning_rate": 6.894535607177959e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5988 + }, + { + "completion_length": 1071.8333435058594, + "epoch": 0.9129573170731707, + "grad_norm": 0.1272015293962133, + "kl": 0.056396484375, + "learning_rate": 6.870635492629479e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5989 + }, + { + "completion_length": 544.1666870117188, + "epoch": 0.913109756097561, + "grad_norm": 2.2782604514144857, + "kl": 0.0537109375, + "learning_rate": 6.846775904271435e-08, + "loss": 0.0021, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 5990 + }, + { + "completion_length": 1031.8333435058594, + "epoch": 0.9132621951219512, + "grad_norm": 0.09433199303602098, + "kl": 0.063232421875, + "learning_rate": 6.822956848859457e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5991 + }, + { + "completion_length": 1602.0, + "epoch": 0.9134146341463415, + "grad_norm": 0.1162254434478265, + "kl": 0.05517578125, + "learning_rate": 6.799178333137784e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5992 + }, + { + "completion_length": 1438.5, + "epoch": 0.9135670731707317, + "grad_norm": 0.14531101272066993, + "kl": 0.0478515625, + "learning_rate": 6.775440363839181e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5993 + }, + { + "completion_length": 2054.5, + "epoch": 0.9137195121951219, + "grad_norm": 0.11151607044768823, + "kl": 0.051025390625, + "learning_rate": 6.751742947684886e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5994 + }, + { + "completion_length": 1239.666748046875, + "epoch": 0.9138719512195121, + "grad_norm": 0.1324733778975682, + "kl": 0.06298828125, + "learning_rate": 6.7280860913847e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5995 + }, + { + "completion_length": 1560.3333740234375, + "epoch": 0.9140243902439025, + "grad_norm": 0.0755967600687027, + "kl": 0.050537109375, + "learning_rate": 6.704469801636881e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5996 + }, + { + "completion_length": 909.6666870117188, + "epoch": 0.9141768292682927, + "grad_norm": 0.1394328888364734, + "kl": 0.072265625, + "learning_rate": 6.680894085128231e-08, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5997 + }, + { + "completion_length": 1052.0000610351562, + "epoch": 0.9143292682926829, + "grad_norm": 0.0941375895465009, + "kl": 0.05712890625, + "learning_rate": 6.657358948534076e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 5998 + }, + { + "completion_length": 888.1667175292969, + "epoch": 0.9144817073170731, + "grad_norm": 1.8194250381426227, + "kl": 0.0662841796875, + "learning_rate": 6.633864398518285e-08, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 5999 + }, + { + "completion_length": 1252.8333740234375, + "epoch": 0.9146341463414634, + "grad_norm": 0.13763264408626968, + "kl": 0.0643310546875, + "learning_rate": 6.610410441733156e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6000 + }, + { + "completion_length": 2469.666748046875, + "epoch": 0.9147865853658537, + "grad_norm": 0.05898830498851551, + "kl": 0.049072265625, + "learning_rate": 6.586997084819524e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6001 + }, + { + "completion_length": 1019.0000305175781, + "epoch": 0.9149390243902439, + "grad_norm": 0.11267709417314231, + "kl": 0.0555419921875, + "learning_rate": 6.563624334406754e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6002 + }, + { + "completion_length": 2065.666748046875, + "epoch": 0.9150914634146341, + "grad_norm": 0.09449167221872595, + "kl": 0.05615234375, + "learning_rate": 6.540292197112702e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6003 + }, + { + "completion_length": 904.3333435058594, + "epoch": 0.9152439024390244, + "grad_norm": 0.11076251809368902, + "kl": 0.066650390625, + "learning_rate": 6.5170006795437e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6004 + }, + { + "completion_length": 1984.166748046875, + "epoch": 0.9153963414634146, + "grad_norm": 0.07921303360350171, + "kl": 0.0576171875, + "learning_rate": 6.49374978829459e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6005 + }, + { + "completion_length": 1314.666748046875, + "epoch": 0.9155487804878049, + "grad_norm": 0.09069833927897673, + "kl": 0.0540771484375, + "learning_rate": 6.470539529948738e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6006 + }, + { + "completion_length": 2135.0001220703125, + "epoch": 0.9157012195121951, + "grad_norm": 0.08119081105156269, + "kl": 0.0438232421875, + "learning_rate": 6.447369911077955e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6007 + }, + { + "completion_length": 1228.1666870117188, + "epoch": 0.9158536585365854, + "grad_norm": 0.08628968023174471, + "kl": 0.0543212890625, + "learning_rate": 6.424240938242643e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6008 + }, + { + "completion_length": 1330.3333740234375, + "epoch": 0.9160060975609756, + "grad_norm": 0.0734313025920745, + "kl": 0.0394287109375, + "learning_rate": 6.40115261799153e-08, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6009 + }, + { + "completion_length": 783.8333435058594, + "epoch": 0.9161585365853658, + "grad_norm": 0.7276931067303141, + "kl": 0.076171875, + "learning_rate": 6.378104956862002e-08, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6010 + }, + { + "completion_length": 1577.3333740234375, + "epoch": 0.916310975609756, + "grad_norm": 0.09275577519120344, + "kl": 0.0478515625, + "learning_rate": 6.355097961379824e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6011 + }, + { + "completion_length": 1568.0000610351562, + "epoch": 0.9164634146341464, + "grad_norm": 0.09500816896902747, + "kl": 0.07666015625, + "learning_rate": 6.332131638059318e-08, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6012 + }, + { + "completion_length": 1444.666748046875, + "epoch": 0.9166158536585366, + "grad_norm": 0.1188852197523488, + "kl": 0.05059814453125, + "learning_rate": 6.309205993403233e-08, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6013 + }, + { + "completion_length": 1635.1667175292969, + "epoch": 0.9167682926829268, + "grad_norm": 0.15201086135253553, + "kl": 0.0479736328125, + "learning_rate": 6.286321033902825e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6014 + }, + { + "completion_length": 1982.166748046875, + "epoch": 0.916920731707317, + "grad_norm": 0.1724676677815167, + "kl": 0.05126953125, + "learning_rate": 6.263476766037813e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6015 + }, + { + "completion_length": 2295.166748046875, + "epoch": 0.9170731707317074, + "grad_norm": 0.1053299392511371, + "kl": 0.0477294921875, + "learning_rate": 6.24067319627642e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6016 + }, + { + "completion_length": 761.0000457763672, + "epoch": 0.9172256097560976, + "grad_norm": 1.7157796491015718, + "kl": 0.06396484375, + "learning_rate": 6.217910331075366e-08, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6017 + }, + { + "completion_length": 1995.1666870117188, + "epoch": 0.9173780487804878, + "grad_norm": 0.07327959227492033, + "kl": 0.055908203125, + "learning_rate": 6.19518817687978e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6018 + }, + { + "completion_length": 2542.666748046875, + "epoch": 0.917530487804878, + "grad_norm": 0.05970500541257158, + "kl": 0.0438232421875, + "learning_rate": 6.172506740123297e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6019 + }, + { + "completion_length": 1140.8333740234375, + "epoch": 0.9176829268292683, + "grad_norm": 0.29071649327577326, + "kl": 0.068115234375, + "learning_rate": 6.149866027228046e-08, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6020 + }, + { + "completion_length": 2533.3333435058594, + "epoch": 0.9178353658536585, + "grad_norm": 0.08105200426956162, + "kl": 0.0482177734375, + "learning_rate": 6.127266044604618e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6021 + }, + { + "completion_length": 756.5000305175781, + "epoch": 0.9179878048780488, + "grad_norm": 0.15612788722544366, + "kl": 0.0611572265625, + "learning_rate": 6.10470679865201e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6022 + }, + { + "completion_length": 1092.0000610351562, + "epoch": 0.918140243902439, + "grad_norm": 0.09807711942722629, + "kl": 0.048583984375, + "learning_rate": 6.082188295757762e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6023 + }, + { + "completion_length": 1371.1666870117188, + "epoch": 0.9182926829268293, + "grad_norm": 0.08496386219398837, + "kl": 0.0611572265625, + "learning_rate": 6.059710542297824e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6024 + }, + { + "completion_length": 1913.3333435058594, + "epoch": 0.9184451219512195, + "grad_norm": 0.07120725123546306, + "kl": 0.0592041015625, + "learning_rate": 6.037273544636673e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6025 + }, + { + "completion_length": 791.3333740234375, + "epoch": 0.9185975609756097, + "grad_norm": 0.10005632799541139, + "kl": 0.048828125, + "learning_rate": 6.014877309127193e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6026 + }, + { + "completion_length": 1589.3334350585938, + "epoch": 0.91875, + "grad_norm": 0.059935575928810325, + "kl": 0.041259765625, + "learning_rate": 5.99252184211071e-08, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6027 + }, + { + "completion_length": 2088.166748046875, + "epoch": 0.9189024390243903, + "grad_norm": 0.07651003809452751, + "kl": 0.0623779296875, + "learning_rate": 5.970207149917062e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6028 + }, + { + "completion_length": 1425.6666870117188, + "epoch": 0.9190548780487805, + "grad_norm": 0.09743547976445321, + "kl": 0.0609130859375, + "learning_rate": 5.947933238864495e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6029 + }, + { + "completion_length": 783.1666717529297, + "epoch": 0.9192073170731707, + "grad_norm": 1.789891349952761, + "kl": 0.064453125, + "learning_rate": 5.925700115259747e-08, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6030 + }, + { + "completion_length": 807.3333740234375, + "epoch": 0.9193597560975609, + "grad_norm": 0.09310101070022617, + "kl": 0.069580078125, + "learning_rate": 5.903507785398016e-08, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6031 + }, + { + "completion_length": 1248.1666870117188, + "epoch": 0.9195121951219513, + "grad_norm": 0.1101615090030557, + "kl": 0.0379638671875, + "learning_rate": 5.8813562555628585e-08, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6032 + }, + { + "completion_length": 975.3333740234375, + "epoch": 0.9196646341463415, + "grad_norm": 0.09909207905986811, + "kl": 0.0657958984375, + "learning_rate": 5.859245532026375e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6033 + }, + { + "completion_length": 747.5000305175781, + "epoch": 0.9198170731707317, + "grad_norm": 1.6671099389792712, + "kl": 0.0616455078125, + "learning_rate": 5.8371756210490735e-08, + "loss": 0.0025, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 6034 + }, + { + "completion_length": 1460.0, + "epoch": 0.9199695121951219, + "grad_norm": 0.10350080427517436, + "kl": 0.0599365234375, + "learning_rate": 5.8151465288799386e-08, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6035 + }, + { + "completion_length": 975.8333740234375, + "epoch": 0.9201219512195122, + "grad_norm": 0.10717138229988617, + "kl": 0.052490234375, + "learning_rate": 5.7931582617563316e-08, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6036 + }, + { + "completion_length": 1808.8333740234375, + "epoch": 0.9202743902439025, + "grad_norm": 0.09119816922851331, + "kl": 0.06787109375, + "learning_rate": 5.771210825904122e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6037 + }, + { + "completion_length": 1109.5000610351562, + "epoch": 0.9204268292682927, + "grad_norm": 2.221692091109146, + "kl": 0.078369140625, + "learning_rate": 5.7493042275375716e-08, + "loss": 0.0031, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6038 + }, + { + "completion_length": 607.8333435058594, + "epoch": 0.9205792682926829, + "grad_norm": 0.1459261831860693, + "kl": 0.056640625, + "learning_rate": 5.72743847285942e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6039 + }, + { + "completion_length": 1084.8333740234375, + "epoch": 0.9207317073170732, + "grad_norm": 0.09762282145061563, + "kl": 0.0513916015625, + "learning_rate": 5.7056135680607965e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6040 + }, + { + "completion_length": 1360.6667175292969, + "epoch": 0.9208841463414634, + "grad_norm": 0.0961698347155575, + "kl": 0.0738525390625, + "learning_rate": 5.683829519321293e-08, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6041 + }, + { + "completion_length": 1221.8333435058594, + "epoch": 0.9210365853658536, + "grad_norm": 0.06161359705872977, + "kl": 0.03521728515625, + "learning_rate": 5.662086332808941e-08, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6042 + }, + { + "completion_length": 833.5000305175781, + "epoch": 0.9211890243902439, + "grad_norm": 0.11116500776621771, + "kl": 0.057373046875, + "learning_rate": 5.6403840146801664e-08, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6043 + }, + { + "completion_length": 1072.0, + "epoch": 0.9213414634146342, + "grad_norm": 0.09897190847710713, + "kl": 0.063720703125, + "learning_rate": 5.6187225710798704e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6044 + }, + { + "completion_length": 1675.0000610351562, + "epoch": 0.9214939024390244, + "grad_norm": 0.07651410599458441, + "kl": 0.053955078125, + "learning_rate": 5.597102008141347e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6045 + }, + { + "completion_length": 1117.8333740234375, + "epoch": 0.9216463414634146, + "grad_norm": 0.09297459859175461, + "kl": 0.0640869140625, + "learning_rate": 5.5755223319862994e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6046 + }, + { + "completion_length": 930.8333740234375, + "epoch": 0.9217987804878048, + "grad_norm": 0.11880528772161769, + "kl": 0.06494140625, + "learning_rate": 5.553983548724922e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6047 + }, + { + "completion_length": 1206.8333740234375, + "epoch": 0.9219512195121952, + "grad_norm": 0.17338999261620955, + "kl": 0.0626220703125, + "learning_rate": 5.532485664455755e-08, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6048 + }, + { + "completion_length": 2022.6666870117188, + "epoch": 0.9221036585365854, + "grad_norm": 0.053935891903356604, + "kl": 0.04443359375, + "learning_rate": 5.511028685265812e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6049 + }, + { + "completion_length": 1055.1666870117188, + "epoch": 0.9222560975609756, + "grad_norm": 1.6424490470402089, + "kl": 0.05126953125, + "learning_rate": 5.489612617230483e-08, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6050 + }, + { + "completion_length": 1412.166748046875, + "epoch": 0.9224085365853658, + "grad_norm": 1.803370040481051, + "kl": 0.0472412109375, + "learning_rate": 5.4682374664136167e-08, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6051 + }, + { + "completion_length": 1881.0000610351562, + "epoch": 0.9225609756097561, + "grad_norm": 0.08938564419764229, + "kl": 0.058349609375, + "learning_rate": 5.4469032388674236e-08, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6052 + }, + { + "completion_length": 1779.3333740234375, + "epoch": 0.9227134146341464, + "grad_norm": 0.07539770457810281, + "kl": 0.0584716796875, + "learning_rate": 5.425609940632587e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6053 + }, + { + "completion_length": 1395.166748046875, + "epoch": 0.9228658536585366, + "grad_norm": 0.08476341595704583, + "kl": 0.058349609375, + "learning_rate": 5.404357577738167e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6054 + }, + { + "completion_length": 812.1666870117188, + "epoch": 0.9230182926829268, + "grad_norm": 0.11187659910187052, + "kl": 0.0537109375, + "learning_rate": 5.383146156201618e-08, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6055 + }, + { + "completion_length": 1818.166748046875, + "epoch": 0.9231707317073171, + "grad_norm": 0.08784427177718424, + "kl": 0.0499267578125, + "learning_rate": 5.3619756820288525e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6056 + }, + { + "completion_length": 954.3333740234375, + "epoch": 0.9233231707317073, + "grad_norm": 0.0980107880791078, + "kl": 0.0489501953125, + "learning_rate": 5.3408461612141416e-08, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6057 + }, + { + "completion_length": 2172.5000610351562, + "epoch": 0.9234756097560975, + "grad_norm": 0.07388305359560698, + "kl": 0.0546875, + "learning_rate": 5.319757599740183e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6058 + }, + { + "completion_length": 940.3333435058594, + "epoch": 0.9236280487804878, + "grad_norm": 0.13513334178314912, + "kl": 0.0445556640625, + "learning_rate": 5.29871000357805e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6059 + }, + { + "completion_length": 1496.0, + "epoch": 0.9237804878048781, + "grad_norm": 0.0809271375027792, + "kl": 0.0648193359375, + "learning_rate": 5.2777033786872595e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6060 + }, + { + "completion_length": 810.8333435058594, + "epoch": 0.9239329268292683, + "grad_norm": 0.09481422666604647, + "kl": 0.0604248046875, + "learning_rate": 5.256737731015721e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6061 + }, + { + "completion_length": 1573.0000610351562, + "epoch": 0.9240853658536585, + "grad_norm": 0.07973032989350168, + "kl": 0.0596923828125, + "learning_rate": 5.2358130664997176e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6062 + }, + { + "completion_length": 2168.0000610351562, + "epoch": 0.9242378048780487, + "grad_norm": 1.1943886012954406, + "kl": 0.054443359375, + "learning_rate": 5.214929391063911e-08, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6063 + }, + { + "completion_length": 2267.0001220703125, + "epoch": 0.9243902439024391, + "grad_norm": 0.07716439148162488, + "kl": 0.0545654296875, + "learning_rate": 5.194086710621404e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6064 + }, + { + "completion_length": 1040.0000305175781, + "epoch": 0.9245426829268293, + "grad_norm": 1.3285124856828072, + "kl": 0.06591796875, + "learning_rate": 5.1732850310736766e-08, + "loss": 0.0026, + "reward": 0.8333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.8333333432674408, + "rewards/format_reward": 0.0, + "step": 6065 + }, + { + "completion_length": 853.1666870117188, + "epoch": 0.9246951219512195, + "grad_norm": 0.11130125801601981, + "kl": 0.0733642578125, + "learning_rate": 5.1525243583106005e-08, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6066 + }, + { + "completion_length": 1714.5, + "epoch": 0.9248475609756097, + "grad_norm": 0.34091326060124205, + "kl": 0.045654296875, + "learning_rate": 5.131804698210424e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6067 + }, + { + "completion_length": 2360.83349609375, + "epoch": 0.925, + "grad_norm": 0.07831104672485271, + "kl": 0.052490234375, + "learning_rate": 5.11112605663977e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6068 + }, + { + "completion_length": 1693.8333740234375, + "epoch": 0.9251524390243903, + "grad_norm": 0.06474722129534358, + "kl": 0.044189453125, + "learning_rate": 5.090488439453689e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6069 + }, + { + "completion_length": 734.0000305175781, + "epoch": 0.9253048780487805, + "grad_norm": 3.152191976996699, + "kl": 0.0643310546875, + "learning_rate": 5.069891852495589e-08, + "loss": 0.0026, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 6070 + }, + { + "completion_length": 1562.166748046875, + "epoch": 0.9254573170731707, + "grad_norm": 0.06655594524023728, + "kl": 0.046875, + "learning_rate": 5.0493363015972905e-08, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6071 + }, + { + "completion_length": 1003.6667175292969, + "epoch": 0.925609756097561, + "grad_norm": 1.7423045789396903, + "kl": 0.04266357421875, + "learning_rate": 5.0288217925789025e-08, + "loss": 0.0017, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6072 + }, + { + "completion_length": 1294.6667175292969, + "epoch": 0.9257621951219512, + "grad_norm": 1.607432072163, + "kl": 0.0579833984375, + "learning_rate": 5.008348331249046e-08, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6073 + }, + { + "completion_length": 942.5, + "epoch": 0.9259146341463415, + "grad_norm": 0.0956239054137041, + "kl": 0.0491943359375, + "learning_rate": 4.987915923404635e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6074 + }, + { + "completion_length": 1198.0000610351562, + "epoch": 0.9260670731707317, + "grad_norm": 0.08951282440900703, + "kl": 0.0458984375, + "learning_rate": 4.967524574831006e-08, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6075 + }, + { + "completion_length": 1394.0000610351562, + "epoch": 0.926219512195122, + "grad_norm": 0.11046544539809897, + "kl": 0.0504150390625, + "learning_rate": 4.947174291301776e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6076 + }, + { + "completion_length": 1456.1666870117188, + "epoch": 0.9263719512195122, + "grad_norm": 1.4194879764575674, + "kl": 0.0606689453125, + "learning_rate": 4.9268650785790517e-08, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6077 + }, + { + "completion_length": 1180.3333740234375, + "epoch": 0.9265243902439024, + "grad_norm": 2.2365822981284054, + "kl": 0.0712890625, + "learning_rate": 4.9065969424132496e-08, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6078 + }, + { + "completion_length": 958.5000305175781, + "epoch": 0.9266768292682926, + "grad_norm": 1.8959238440315571, + "kl": 0.0599365234375, + "learning_rate": 4.886369888543163e-08, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6079 + }, + { + "completion_length": 1065.0, + "epoch": 0.926829268292683, + "grad_norm": 0.10008495853930231, + "kl": 0.052001953125, + "learning_rate": 4.86618392269596e-08, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6080 + }, + { + "completion_length": 639.8333740234375, + "epoch": 0.9269817073170732, + "grad_norm": 0.09106182770218071, + "kl": 0.052001953125, + "learning_rate": 4.84603905058717e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6081 + }, + { + "completion_length": 991.0000610351562, + "epoch": 0.9271341463414634, + "grad_norm": 1.883771954093039, + "kl": 0.0709228515625, + "learning_rate": 4.8259352779206786e-08, + "loss": 0.0028, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6082 + }, + { + "completion_length": 974.6666870117188, + "epoch": 0.9272865853658536, + "grad_norm": 0.12073643493782592, + "kl": 0.053466796875, + "learning_rate": 4.805872610388767e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6083 + }, + { + "completion_length": 822.1666870117188, + "epoch": 0.927439024390244, + "grad_norm": 1.9751510210368142, + "kl": 0.094970703125, + "learning_rate": 4.785851053672041e-08, + "loss": 0.0038, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 6084 + }, + { + "completion_length": 635.5000305175781, + "epoch": 0.9275914634146342, + "grad_norm": 0.14956143443944775, + "kl": 0.0640869140625, + "learning_rate": 4.765870613439482e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6085 + }, + { + "completion_length": 1395.5000610351562, + "epoch": 0.9277439024390244, + "grad_norm": 0.0953746583285504, + "kl": 0.074951171875, + "learning_rate": 4.745931295348416e-08, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6086 + }, + { + "completion_length": 910.1666870117188, + "epoch": 0.9278963414634146, + "grad_norm": 1.955985919565516, + "kl": 0.0648193359375, + "learning_rate": 4.7260331050445415e-08, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6087 + }, + { + "completion_length": 1406.8333435058594, + "epoch": 0.9280487804878049, + "grad_norm": 0.06765646258773156, + "kl": 0.0450439453125, + "learning_rate": 4.70617604816192e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6088 + }, + { + "completion_length": 1260.3333740234375, + "epoch": 0.9282012195121951, + "grad_norm": 0.18752269389987003, + "kl": 0.06201171875, + "learning_rate": 4.68636013032297e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6089 + }, + { + "completion_length": 1478.3333740234375, + "epoch": 0.9283536585365854, + "grad_norm": 0.07467584169300279, + "kl": 0.08056640625, + "learning_rate": 4.666585357138387e-08, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6090 + }, + { + "completion_length": 635.5000305175781, + "epoch": 0.9285060975609756, + "grad_norm": 0.27337034746405176, + "kl": 0.0888671875, + "learning_rate": 4.646851734207325e-08, + "loss": 0.0036, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6091 + }, + { + "completion_length": 1041.6666870117188, + "epoch": 0.9286585365853659, + "grad_norm": 0.09146373031324272, + "kl": 0.0550537109375, + "learning_rate": 4.627159267117215e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6092 + }, + { + "completion_length": 1155.1667175292969, + "epoch": 0.9288109756097561, + "grad_norm": 0.08745541345044348, + "kl": 0.0662841796875, + "learning_rate": 4.6075079614438794e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6093 + }, + { + "completion_length": 1231.8333435058594, + "epoch": 0.9289634146341463, + "grad_norm": 0.35356339473056747, + "kl": 0.0556640625, + "learning_rate": 4.587897822751452e-08, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6094 + }, + { + "completion_length": 1496.8333740234375, + "epoch": 0.9291158536585366, + "grad_norm": 0.11590423963520827, + "kl": 0.0601806640625, + "learning_rate": 4.568328856592407e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6095 + }, + { + "completion_length": 2126.166748046875, + "epoch": 0.9292682926829269, + "grad_norm": 0.09421071997392963, + "kl": 0.05517578125, + "learning_rate": 4.54880106850758e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6096 + }, + { + "completion_length": 2022.8334350585938, + "epoch": 0.9294207317073171, + "grad_norm": 0.07962336765056954, + "kl": 0.0537109375, + "learning_rate": 4.529314464026163e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6097 + }, + { + "completion_length": 1937.3334350585938, + "epoch": 0.9295731707317073, + "grad_norm": 0.07295951413797187, + "kl": 0.037109375, + "learning_rate": 4.509869048665643e-08, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6098 + }, + { + "completion_length": 1861.0, + "epoch": 0.9297256097560975, + "grad_norm": 0.10254777854049853, + "kl": 0.067138671875, + "learning_rate": 4.490464827931884e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6099 + }, + { + "completion_length": 2662.0001220703125, + "epoch": 0.9298780487804879, + "grad_norm": 1.0206307296789383, + "kl": 0.0574951171875, + "learning_rate": 4.471101807319072e-08, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6100 + }, + { + "completion_length": 1492.5, + "epoch": 0.9300304878048781, + "grad_norm": 0.10047533944682063, + "kl": 0.05908203125, + "learning_rate": 4.451779992309707e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6101 + }, + { + "completion_length": 2486.0001220703125, + "epoch": 0.9301829268292683, + "grad_norm": 0.1749007593124448, + "kl": 0.068603515625, + "learning_rate": 4.4324993883746614e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6102 + }, + { + "completion_length": 856.8333740234375, + "epoch": 0.9303353658536585, + "grad_norm": 0.0938277179620897, + "kl": 0.0528564453125, + "learning_rate": 4.4132600009731185e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6103 + }, + { + "completion_length": 1346.5000610351562, + "epoch": 0.9304878048780488, + "grad_norm": 0.0877945979239473, + "kl": 0.0369873046875, + "learning_rate": 4.394061835552554e-08, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6104 + }, + { + "completion_length": 1603.666748046875, + "epoch": 0.930640243902439, + "grad_norm": 0.09575842125718699, + "kl": 0.0673828125, + "learning_rate": 4.37490489754887e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6105 + }, + { + "completion_length": 644.6666717529297, + "epoch": 0.9307926829268293, + "grad_norm": 0.10265317364082266, + "kl": 0.061279296875, + "learning_rate": 4.355789192386178e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6106 + }, + { + "completion_length": 975.1667175292969, + "epoch": 0.9309451219512195, + "grad_norm": 0.12222063622926474, + "kl": 0.07421875, + "learning_rate": 4.3367147254770144e-08, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6107 + }, + { + "completion_length": 1532.8334350585938, + "epoch": 0.9310975609756098, + "grad_norm": 0.15378839339165262, + "kl": 0.049072265625, + "learning_rate": 4.317681502222159e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6108 + }, + { + "completion_length": 1264.3333740234375, + "epoch": 0.93125, + "grad_norm": 0.1559794752645207, + "kl": 0.065673828125, + "learning_rate": 4.298689528010785e-08, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 6109 + }, + { + "completion_length": 896.3333435058594, + "epoch": 0.9314024390243902, + "grad_norm": 0.08780797350186689, + "kl": 0.05157470703125, + "learning_rate": 4.279738808220324e-08, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6110 + }, + { + "completion_length": 1219.5, + "epoch": 0.9315548780487805, + "grad_norm": 0.12581083251382139, + "kl": 0.0606689453125, + "learning_rate": 4.2608293482165827e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6111 + }, + { + "completion_length": 1543.3333740234375, + "epoch": 0.9317073170731708, + "grad_norm": 0.15281828376073195, + "kl": 0.072998046875, + "learning_rate": 4.2419611533536296e-08, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6112 + }, + { + "completion_length": 1210.3333435058594, + "epoch": 0.931859756097561, + "grad_norm": 0.08298688587165372, + "kl": 0.0550537109375, + "learning_rate": 4.223134228973891e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6113 + }, + { + "completion_length": 906.3333435058594, + "epoch": 0.9320121951219512, + "grad_norm": 2.870935315726375, + "kl": 0.0576171875, + "learning_rate": 4.204348580408102e-08, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6114 + }, + { + "completion_length": 1095.1666870117188, + "epoch": 0.9321646341463414, + "grad_norm": 1.9309427026802393, + "kl": 0.0594482421875, + "learning_rate": 4.1856042129752915e-08, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6115 + }, + { + "completion_length": 630.0, + "epoch": 0.9323170731707318, + "grad_norm": 1.6615333178765093, + "kl": 0.06298828125, + "learning_rate": 4.1669011319827975e-08, + "loss": 0.0025, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6116 + }, + { + "completion_length": 3469.0001220703125, + "epoch": 0.932469512195122, + "grad_norm": 0.03843877266278798, + "kl": 0.0390625, + "learning_rate": 4.148239342726301e-08, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6117 + }, + { + "completion_length": 993.5000610351562, + "epoch": 0.9326219512195122, + "grad_norm": 0.09844201220313996, + "kl": 0.0521240234375, + "learning_rate": 4.129618850489758e-08, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6118 + }, + { + "completion_length": 1594.166748046875, + "epoch": 0.9327743902439024, + "grad_norm": 0.09225730340574469, + "kl": 0.06787109375, + "learning_rate": 4.1110396605454515e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6119 + }, + { + "completion_length": 1351.0000305175781, + "epoch": 0.9329268292682927, + "grad_norm": 0.1008228969331423, + "kl": 0.051513671875, + "learning_rate": 4.0925017781539896e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6120 + }, + { + "completion_length": 1763.666748046875, + "epoch": 0.933079268292683, + "grad_norm": 0.10163301908865248, + "kl": 0.05029296875, + "learning_rate": 4.0740052085642075e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6121 + }, + { + "completion_length": 1430.1666870117188, + "epoch": 0.9332317073170732, + "grad_norm": 0.14555936249300275, + "kl": 0.077392578125, + "learning_rate": 4.055549957013332e-08, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6122 + }, + { + "completion_length": 1238.0000610351562, + "epoch": 0.9333841463414634, + "grad_norm": 0.080947546159176, + "kl": 0.0732421875, + "learning_rate": 4.0371360287268346e-08, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6123 + }, + { + "completion_length": 2977.666748046875, + "epoch": 0.9335365853658537, + "grad_norm": 0.056507265636839346, + "kl": 0.042724609375, + "learning_rate": 4.018763428918509e-08, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6124 + }, + { + "completion_length": 1384.3333740234375, + "epoch": 0.9336890243902439, + "grad_norm": 0.10466937034610105, + "kl": 0.0650634765625, + "learning_rate": 4.000432162790463e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6125 + }, + { + "completion_length": 781.6666870117188, + "epoch": 0.9338414634146341, + "grad_norm": 0.09971279513652305, + "kl": 0.051025390625, + "learning_rate": 3.9821422355330426e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6126 + }, + { + "completion_length": 1884.3333740234375, + "epoch": 0.9339939024390244, + "grad_norm": 0.08053359532282885, + "kl": 0.06396484375, + "learning_rate": 3.963893652324924e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6127 + }, + { + "completion_length": 1377.5000610351562, + "epoch": 0.9341463414634147, + "grad_norm": 0.09184598906561336, + "kl": 0.0408935546875, + "learning_rate": 3.9456864183331557e-08, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6128 + }, + { + "completion_length": 649.3333587646484, + "epoch": 0.9342987804878049, + "grad_norm": 0.11821565221887645, + "kl": 0.0618896484375, + "learning_rate": 3.9275205387129144e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6129 + }, + { + "completion_length": 2563.5001220703125, + "epoch": 0.9344512195121951, + "grad_norm": 0.11559663024431253, + "kl": 0.0556640625, + "learning_rate": 3.9093960186078024e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6130 + }, + { + "completion_length": 975.8333740234375, + "epoch": 0.9346036585365853, + "grad_norm": 0.07813731760987934, + "kl": 0.0426025390625, + "learning_rate": 3.891312863149649e-08, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6131 + }, + { + "completion_length": 1536.1666870117188, + "epoch": 0.9347560975609757, + "grad_norm": 0.13318126721700202, + "kl": 0.06103515625, + "learning_rate": 3.873271077458607e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6132 + }, + { + "completion_length": 1702.0, + "epoch": 0.9349085365853659, + "grad_norm": 0.43651891737997967, + "kl": 0.0567626953125, + "learning_rate": 3.855270666643074e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6133 + }, + { + "completion_length": 911.1666717529297, + "epoch": 0.9350609756097561, + "grad_norm": 0.1284342391503144, + "kl": 0.0523681640625, + "learning_rate": 3.837311635799773e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6134 + }, + { + "completion_length": 1047.3333740234375, + "epoch": 0.9352134146341463, + "grad_norm": 1.6549836228133816, + "kl": 0.05096435546875, + "learning_rate": 3.8193939900136535e-08, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6135 + }, + { + "completion_length": 2685.83349609375, + "epoch": 0.9353658536585366, + "grad_norm": 0.06596885689757144, + "kl": 0.045654296875, + "learning_rate": 3.80151773435804e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6136 + }, + { + "completion_length": 1769.666748046875, + "epoch": 0.9355182926829269, + "grad_norm": 0.06488382277213155, + "kl": 0.0498046875, + "learning_rate": 3.783682873894434e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6137 + }, + { + "completion_length": 960.0000305175781, + "epoch": 0.9356707317073171, + "grad_norm": 0.17756617985179604, + "kl": 0.0467529296875, + "learning_rate": 3.765889413672713e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6138 + }, + { + "completion_length": 1166.5000305175781, + "epoch": 0.9358231707317073, + "grad_norm": 1.2924518594576326, + "kl": 0.0513916015625, + "learning_rate": 3.7481373587309464e-08, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6139 + }, + { + "completion_length": 814.8333435058594, + "epoch": 0.9359756097560976, + "grad_norm": 0.0974997467193128, + "kl": 0.054443359375, + "learning_rate": 3.7304267140955305e-08, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6140 + }, + { + "completion_length": 1079.8333740234375, + "epoch": 0.9361280487804878, + "grad_norm": 0.08573173330011259, + "kl": 0.0430908203125, + "learning_rate": 3.712757484781121e-08, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6141 + }, + { + "completion_length": 1273.666748046875, + "epoch": 0.936280487804878, + "grad_norm": 0.0994159649116197, + "kl": 0.063232421875, + "learning_rate": 3.695129675790665e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6142 + }, + { + "completion_length": 919.1666870117188, + "epoch": 0.9364329268292683, + "grad_norm": 0.10333267774670184, + "kl": 0.0611572265625, + "learning_rate": 3.6775432921153706e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6143 + }, + { + "completion_length": 2019.3333740234375, + "epoch": 0.9365853658536586, + "grad_norm": 0.10943118932712265, + "kl": 0.058837890625, + "learning_rate": 3.659998338734671e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6144 + }, + { + "completion_length": 1484.166748046875, + "epoch": 0.9367378048780488, + "grad_norm": 0.06254166014892287, + "kl": 0.03985595703125, + "learning_rate": 3.642494820616343e-08, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6145 + }, + { + "completion_length": 1948.3333740234375, + "epoch": 0.936890243902439, + "grad_norm": 0.06261322320853593, + "kl": 0.0574951171875, + "learning_rate": 3.6250327427164054e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6146 + }, + { + "completion_length": 962.3333435058594, + "epoch": 0.9370426829268292, + "grad_norm": 0.0856860870341909, + "kl": 0.0693359375, + "learning_rate": 3.607612109979136e-08, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6147 + }, + { + "completion_length": 1040.1667175292969, + "epoch": 0.9371951219512196, + "grad_norm": 0.1539449923011917, + "kl": 0.0665283203125, + "learning_rate": 3.590232927337056e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6148 + }, + { + "completion_length": 1395.8334350585938, + "epoch": 0.9373475609756098, + "grad_norm": 1.2622261332186904, + "kl": 0.0517578125, + "learning_rate": 3.572895199710979e-08, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6149 + }, + { + "completion_length": 1824.6666870117188, + "epoch": 0.9375, + "grad_norm": 0.15724419294060443, + "kl": 0.0621337890625, + "learning_rate": 3.5555989320099955e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6150 + }, + { + "completion_length": 1369.1666870117188, + "epoch": 0.9376524390243902, + "grad_norm": 0.10302939266609298, + "kl": 0.0540771484375, + "learning_rate": 3.538344129131421e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6151 + }, + { + "completion_length": 1876.6666870117188, + "epoch": 0.9378048780487804, + "grad_norm": 0.08866835383983158, + "kl": 0.0487060546875, + "learning_rate": 3.5211307959608475e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6152 + }, + { + "completion_length": 777.5000305175781, + "epoch": 0.9379573170731708, + "grad_norm": 0.10263737202058551, + "kl": 0.05322265625, + "learning_rate": 3.503958937372126e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6153 + }, + { + "completion_length": 769.3333435058594, + "epoch": 0.938109756097561, + "grad_norm": 0.18378220845100274, + "kl": 0.0445556640625, + "learning_rate": 3.486828558227351e-08, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6154 + }, + { + "completion_length": 1944.5001220703125, + "epoch": 0.9382621951219512, + "grad_norm": 0.06591299373304114, + "kl": 0.0513916015625, + "learning_rate": 3.469739663376892e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6155 + }, + { + "completion_length": 1698.1666870117188, + "epoch": 0.9384146341463414, + "grad_norm": 0.12195062949054945, + "kl": 0.0703125, + "learning_rate": 3.452692257659379e-08, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6156 + }, + { + "completion_length": 1937.166748046875, + "epoch": 0.9385670731707317, + "grad_norm": 0.17176599631439732, + "kl": 0.05419921875, + "learning_rate": 3.4356863459016505e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6157 + }, + { + "completion_length": 773.3333435058594, + "epoch": 0.938719512195122, + "grad_norm": 0.0875167275129952, + "kl": 0.0440673828125, + "learning_rate": 3.418721932918839e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6158 + }, + { + "completion_length": 1882.3333740234375, + "epoch": 0.9388719512195122, + "grad_norm": 0.07065264929307047, + "kl": 0.0465087890625, + "learning_rate": 3.401799023514318e-08, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6159 + }, + { + "completion_length": 1495.5000610351562, + "epoch": 0.9390243902439024, + "grad_norm": 1.5756876457110884, + "kl": 0.074462890625, + "learning_rate": 3.3849176224796884e-08, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6160 + }, + { + "completion_length": 1931.3333740234375, + "epoch": 0.9391768292682927, + "grad_norm": 1.411411693481174, + "kl": 0.0616455078125, + "learning_rate": 3.368077734594827e-08, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6161 + }, + { + "completion_length": 928.1666870117188, + "epoch": 0.9393292682926829, + "grad_norm": 0.13270974999795618, + "kl": 0.0614013671875, + "learning_rate": 3.351279364627835e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6162 + }, + { + "completion_length": 1267.3333740234375, + "epoch": 0.9394817073170731, + "grad_norm": 0.10457508794873356, + "kl": 0.051025390625, + "learning_rate": 3.334522517335076e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6163 + }, + { + "completion_length": 2291.0001220703125, + "epoch": 0.9396341463414634, + "grad_norm": 0.0673063932905054, + "kl": 0.055419921875, + "learning_rate": 3.317807197461137e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6164 + }, + { + "completion_length": 1901.5, + "epoch": 0.9397865853658537, + "grad_norm": 0.13958183215819767, + "kl": 0.0616455078125, + "learning_rate": 3.301133409738849e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6165 + }, + { + "completion_length": 982.5000305175781, + "epoch": 0.9399390243902439, + "grad_norm": 0.117350329175574, + "kl": 0.0701904296875, + "learning_rate": 3.284501158889319e-08, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6166 + }, + { + "completion_length": 1562.8333740234375, + "epoch": 0.9400914634146341, + "grad_norm": 0.09976855061760476, + "kl": 0.072265625, + "learning_rate": 3.267910449621847e-08, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6167 + }, + { + "completion_length": 2086.166748046875, + "epoch": 0.9402439024390243, + "grad_norm": 0.08229576506691319, + "kl": 0.0469970703125, + "learning_rate": 3.2513612866339916e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6168 + }, + { + "completion_length": 1515.166748046875, + "epoch": 0.9403963414634147, + "grad_norm": 0.12801923429785314, + "kl": 0.084228515625, + "learning_rate": 3.234853674611554e-08, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6169 + }, + { + "completion_length": 1526.8334197998047, + "epoch": 0.9405487804878049, + "grad_norm": 0.0841923137957738, + "kl": 0.0491943359375, + "learning_rate": 3.2183876182285466e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6170 + }, + { + "completion_length": 1224.8333740234375, + "epoch": 0.9407012195121951, + "grad_norm": 0.08602804795803243, + "kl": 0.0548095703125, + "learning_rate": 3.201963122147239e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6171 + }, + { + "completion_length": 1986.666748046875, + "epoch": 0.9408536585365853, + "grad_norm": 0.09099651401089326, + "kl": 0.0570068359375, + "learning_rate": 3.185580191018128e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6172 + }, + { + "completion_length": 967.8333740234375, + "epoch": 0.9410060975609756, + "grad_norm": 1.5288441208173988, + "kl": 0.0533447265625, + "learning_rate": 3.169238829479937e-08, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6173 + }, + { + "completion_length": 1581.0000610351562, + "epoch": 0.9411585365853659, + "grad_norm": 0.08920376653268382, + "kl": 0.0552978515625, + "learning_rate": 3.1529390421596305e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6174 + }, + { + "completion_length": 1462.5, + "epoch": 0.9413109756097561, + "grad_norm": 0.11073522799340205, + "kl": 0.0645751953125, + "learning_rate": 3.136680833672367e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6175 + }, + { + "completion_length": 1099.3333435058594, + "epoch": 0.9414634146341463, + "grad_norm": 0.1374592494385013, + "kl": 0.0753173828125, + "learning_rate": 3.1204642086215817e-08, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6176 + }, + { + "completion_length": 1077.0, + "epoch": 0.9416158536585366, + "grad_norm": 0.0865822650934946, + "kl": 0.05078125, + "learning_rate": 3.1042891715989007e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6177 + }, + { + "completion_length": 1440.3333740234375, + "epoch": 0.9417682926829268, + "grad_norm": 0.09396537835871252, + "kl": 0.07666015625, + "learning_rate": 3.088155727184194e-08, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6178 + }, + { + "completion_length": 2345.5001220703125, + "epoch": 0.941920731707317, + "grad_norm": 0.07513466951546581, + "kl": 0.0501708984375, + "learning_rate": 3.072063879945525e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6179 + }, + { + "completion_length": 1428.6666870117188, + "epoch": 0.9420731707317073, + "grad_norm": 0.36682662053459736, + "kl": 0.0799560546875, + "learning_rate": 3.056013634439198e-08, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6180 + }, + { + "completion_length": 1565.0000610351562, + "epoch": 0.9422256097560976, + "grad_norm": 0.09129602388961108, + "kl": 0.0504150390625, + "learning_rate": 3.0400049952097776e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6181 + }, + { + "completion_length": 1221.8333740234375, + "epoch": 0.9423780487804878, + "grad_norm": 0.06601308523463127, + "kl": 0.0447998046875, + "learning_rate": 3.024037966789972e-08, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6182 + }, + { + "completion_length": 2071.0, + "epoch": 0.942530487804878, + "grad_norm": 0.0752501613414769, + "kl": 0.0523681640625, + "learning_rate": 3.008112553700781e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6183 + }, + { + "completion_length": 988.0000610351562, + "epoch": 0.9426829268292682, + "grad_norm": 0.11570200145080624, + "kl": 0.0555419921875, + "learning_rate": 2.992228760451349e-08, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6184 + }, + { + "completion_length": 1538.5000610351562, + "epoch": 0.9428353658536586, + "grad_norm": 0.12745767980105793, + "kl": 0.052734375, + "learning_rate": 2.9763865915391098e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6185 + }, + { + "completion_length": 2045.5, + "epoch": 0.9429878048780488, + "grad_norm": 0.07377372241184126, + "kl": 0.0667724609375, + "learning_rate": 2.960586051449643e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6186 + }, + { + "completion_length": 1002.5000610351562, + "epoch": 0.943140243902439, + "grad_norm": 0.12123464321879636, + "kl": 0.0482177734375, + "learning_rate": 2.9448271446568198e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6187 + }, + { + "completion_length": 2307.0, + "epoch": 0.9432926829268292, + "grad_norm": 0.06336216132183763, + "kl": 0.051513671875, + "learning_rate": 2.929109875622621e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6188 + }, + { + "completion_length": 1118.1666870117188, + "epoch": 0.9434451219512195, + "grad_norm": 0.10654345044088621, + "kl": 0.052978515625, + "learning_rate": 2.9134342487973197e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6189 + }, + { + "completion_length": 3359.8333740234375, + "epoch": 0.9435975609756098, + "grad_norm": 0.8965377266394952, + "kl": 0.0457763671875, + "learning_rate": 2.897800268619366e-08, + "loss": 0.0018, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6190 + }, + { + "completion_length": 1309.0000305175781, + "epoch": 0.94375, + "grad_norm": 0.0711125212320331, + "kl": 0.0491943359375, + "learning_rate": 2.8822079395154353e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6191 + }, + { + "completion_length": 1453.1667175292969, + "epoch": 0.9439024390243902, + "grad_norm": 0.14865690889752323, + "kl": 0.0592041015625, + "learning_rate": 2.8666572659003965e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6192 + }, + { + "completion_length": 1110.8333740234375, + "epoch": 0.9440548780487805, + "grad_norm": 0.08001736607544821, + "kl": 0.054931640625, + "learning_rate": 2.8511482521773104e-08, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6193 + }, + { + "completion_length": 1082.8333435058594, + "epoch": 0.9442073170731707, + "grad_norm": 0.09291743625913913, + "kl": 0.0450439453125, + "learning_rate": 2.8356809027374807e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6194 + }, + { + "completion_length": 1605.666748046875, + "epoch": 0.944359756097561, + "grad_norm": 0.09798602585027441, + "kl": 0.0579833984375, + "learning_rate": 2.8202552219603718e-08, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6195 + }, + { + "completion_length": 1267.6666870117188, + "epoch": 0.9445121951219512, + "grad_norm": 0.11845910887924087, + "kl": 0.0584716796875, + "learning_rate": 2.804871214213689e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6196 + }, + { + "completion_length": 966.5000305175781, + "epoch": 0.9446646341463415, + "grad_norm": 1.7957625553861107, + "kl": 0.0533447265625, + "learning_rate": 2.7895288838532983e-08, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6197 + }, + { + "completion_length": 1400.8333740234375, + "epoch": 0.9448170731707317, + "grad_norm": 1.602424858734981, + "kl": 0.051025390625, + "learning_rate": 2.774228235223292e-08, + "loss": 0.002, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 6198 + }, + { + "completion_length": 1378.8333740234375, + "epoch": 0.9449695121951219, + "grad_norm": 0.07485269807741587, + "kl": 0.0537109375, + "learning_rate": 2.7589692726559536e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6199 + }, + { + "completion_length": 1282.1666870117188, + "epoch": 0.9451219512195121, + "grad_norm": 0.1067497578983425, + "kl": 0.055419921875, + "learning_rate": 2.743752000471761e-08, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6200 + }, + { + "completion_length": 1439.3333740234375, + "epoch": 0.9452743902439025, + "grad_norm": 0.1241564965695072, + "kl": 0.0374755859375, + "learning_rate": 2.7285764229794008e-08, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6201 + }, + { + "completion_length": 854.3333435058594, + "epoch": 0.9454268292682927, + "grad_norm": 0.11468100325760121, + "kl": 0.0660400390625, + "learning_rate": 2.713442544475736e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6202 + }, + { + "completion_length": 1160.5, + "epoch": 0.9455792682926829, + "grad_norm": 0.09540747110124378, + "kl": 0.056884765625, + "learning_rate": 2.698350369245839e-08, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6203 + }, + { + "completion_length": 2572.8333740234375, + "epoch": 0.9457317073170731, + "grad_norm": 0.3114972053696646, + "kl": 0.0684814453125, + "learning_rate": 2.6832999015629577e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6204 + }, + { + "completion_length": 1131.8333740234375, + "epoch": 0.9458841463414634, + "grad_norm": 0.0714467440222685, + "kl": 0.0384521484375, + "learning_rate": 2.6682911456885505e-08, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6205 + }, + { + "completion_length": 1047.3333740234375, + "epoch": 0.9460365853658537, + "grad_norm": 0.12947696282599436, + "kl": 0.065185546875, + "learning_rate": 2.653324105872218e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6206 + }, + { + "completion_length": 1740.8334350585938, + "epoch": 0.9461890243902439, + "grad_norm": 0.08138026631731426, + "kl": 0.048583984375, + "learning_rate": 2.63839878635182e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6207 + }, + { + "completion_length": 1752.8333435058594, + "epoch": 0.9463414634146341, + "grad_norm": 0.08978940420278525, + "kl": 0.0467529296875, + "learning_rate": 2.6235151913533595e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6208 + }, + { + "completion_length": 880.8333740234375, + "epoch": 0.9464939024390244, + "grad_norm": 0.0692673566221876, + "kl": 0.032470703125, + "learning_rate": 2.608673325091032e-08, + "loss": 0.0013, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6209 + }, + { + "completion_length": 2194.5000610351562, + "epoch": 0.9466463414634146, + "grad_norm": 1.262537218096916, + "kl": 0.04931640625, + "learning_rate": 2.59387319176721e-08, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6210 + }, + { + "completion_length": 1317.3333740234375, + "epoch": 0.9467987804878049, + "grad_norm": 0.1475129866544426, + "kl": 0.062744140625, + "learning_rate": 2.5791147955724737e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6211 + }, + { + "completion_length": 1382.3333435058594, + "epoch": 0.9469512195121951, + "grad_norm": 1.2482641435670319, + "kl": 0.0487060546875, + "learning_rate": 2.5643981406855642e-08, + "loss": 0.0019, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6212 + }, + { + "completion_length": 3142.666748046875, + "epoch": 0.9471036585365854, + "grad_norm": 0.04223693350361157, + "kl": 0.0419921875, + "learning_rate": 2.5497232312733985e-08, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6213 + }, + { + "completion_length": 1870.5000915527344, + "epoch": 0.9472560975609756, + "grad_norm": 0.1310296555202849, + "kl": 0.0604248046875, + "learning_rate": 2.5350900714911363e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6214 + }, + { + "completion_length": 1028.5000305175781, + "epoch": 0.9474085365853658, + "grad_norm": 0.08478155632407346, + "kl": 0.0540771484375, + "learning_rate": 2.520498665481996e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6215 + }, + { + "completion_length": 1579.166748046875, + "epoch": 0.947560975609756, + "grad_norm": 0.09217651472719142, + "kl": 0.065185546875, + "learning_rate": 2.50594901737749e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6216 + }, + { + "completion_length": 1515.166748046875, + "epoch": 0.9477134146341464, + "grad_norm": 0.0828187984703424, + "kl": 0.06005859375, + "learning_rate": 2.4914411312972397e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6217 + }, + { + "completion_length": 1593.8333740234375, + "epoch": 0.9478658536585366, + "grad_norm": 0.07497094065860138, + "kl": 0.044921875, + "learning_rate": 2.476975011349075e-08, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6218 + }, + { + "completion_length": 1690.1666870117188, + "epoch": 0.9480182926829268, + "grad_norm": 1.2166631387398827, + "kl": 0.0538330078125, + "learning_rate": 2.4625506616289873e-08, + "loss": 0.0021, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6219 + }, + { + "completion_length": 1714.0000915527344, + "epoch": 0.948170731707317, + "grad_norm": 0.10930501245141641, + "kl": 0.047607421875, + "learning_rate": 2.4481680862211418e-08, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6220 + }, + { + "completion_length": 886.5000305175781, + "epoch": 0.9483231707317074, + "grad_norm": 0.1253636051564492, + "kl": 0.039306640625, + "learning_rate": 2.4338272891978653e-08, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6221 + }, + { + "completion_length": 1227.5000610351562, + "epoch": 0.9484756097560976, + "grad_norm": 0.09277470199453743, + "kl": 0.046630859375, + "learning_rate": 2.4195282746196755e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6222 + }, + { + "completion_length": 1841.8333740234375, + "epoch": 0.9486280487804878, + "grad_norm": 0.1433194688725106, + "kl": 0.0615234375, + "learning_rate": 2.4052710465352513e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6223 + }, + { + "completion_length": 2122.1666870117188, + "epoch": 0.948780487804878, + "grad_norm": 0.06580122786712773, + "kl": 0.0587158203125, + "learning_rate": 2.3910556089814294e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6224 + }, + { + "completion_length": 2512.8333740234375, + "epoch": 0.9489329268292683, + "grad_norm": 0.9321193396048898, + "kl": 0.0550537109375, + "learning_rate": 2.376881965983224e-08, + "loss": 0.0022, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6225 + }, + { + "completion_length": 822.8333740234375, + "epoch": 0.9490853658536585, + "grad_norm": 0.13500247602867455, + "kl": 0.04833984375, + "learning_rate": 2.3627501215538083e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6226 + }, + { + "completion_length": 1711.6666870117188, + "epoch": 0.9492378048780488, + "grad_norm": 0.06245227218217376, + "kl": 0.0576171875, + "learning_rate": 2.3486600796945644e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6227 + }, + { + "completion_length": 805.1666870117188, + "epoch": 0.949390243902439, + "grad_norm": 1.399797139344873, + "kl": 0.0545654296875, + "learning_rate": 2.334611844394935e-08, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6228 + }, + { + "completion_length": 1670.1666870117188, + "epoch": 0.9495426829268293, + "grad_norm": 0.07012903046706774, + "kl": 0.048583984375, + "learning_rate": 2.3206054196326375e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6229 + }, + { + "completion_length": 1464.6666870117188, + "epoch": 0.9496951219512195, + "grad_norm": 1.7819489534449322, + "kl": 0.0594482421875, + "learning_rate": 2.3066408093735002e-08, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6230 + }, + { + "completion_length": 1839.8333740234375, + "epoch": 0.9498475609756097, + "grad_norm": 0.06041990108269951, + "kl": 0.0465087890625, + "learning_rate": 2.2927180175714935e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6231 + }, + { + "completion_length": 887.0000305175781, + "epoch": 0.95, + "grad_norm": 0.46947003127441816, + "kl": 0.0740966796875, + "learning_rate": 2.278837048168797e-08, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6232 + }, + { + "completion_length": 1496.1666870117188, + "epoch": 0.9501524390243903, + "grad_norm": 0.07154216629135583, + "kl": 0.0557861328125, + "learning_rate": 2.2649979050957003e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6233 + }, + { + "completion_length": 1076.1666870117188, + "epoch": 0.9503048780487805, + "grad_norm": 0.15642926524480255, + "kl": 0.05615234375, + "learning_rate": 2.251200592270686e-08, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6234 + }, + { + "completion_length": 1911.3333435058594, + "epoch": 0.9504573170731707, + "grad_norm": 0.07191361969931262, + "kl": 0.0380859375, + "learning_rate": 2.2374451136003614e-08, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6235 + }, + { + "completion_length": 1511.5000610351562, + "epoch": 0.9506097560975609, + "grad_norm": 0.06457490681111588, + "kl": 0.03875732421875, + "learning_rate": 2.223731472979512e-08, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6236 + }, + { + "completion_length": 1495.5000610351562, + "epoch": 0.9507621951219513, + "grad_norm": 0.08765350071653724, + "kl": 0.043701171875, + "learning_rate": 2.2100596742910817e-08, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6237 + }, + { + "completion_length": 2049.0, + "epoch": 0.9509146341463415, + "grad_norm": 0.04276045694303031, + "kl": 0.0367431640625, + "learning_rate": 2.1964297214061236e-08, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6238 + }, + { + "completion_length": 2072.5000610351562, + "epoch": 0.9510670731707317, + "grad_norm": 0.14310999576511246, + "kl": 0.04736328125, + "learning_rate": 2.1828416181839007e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6239 + }, + { + "completion_length": 690.0000305175781, + "epoch": 0.9512195121951219, + "grad_norm": 2.1763609135681175, + "kl": 0.0672607421875, + "learning_rate": 2.1692953684718187e-08, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6240 + }, + { + "completion_length": 1192.6666870117188, + "epoch": 0.9513719512195122, + "grad_norm": 0.10357879417521137, + "kl": 0.054443359375, + "learning_rate": 2.1557909761053764e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6241 + }, + { + "completion_length": 623.5, + "epoch": 0.9515243902439025, + "grad_norm": 0.13714159038586418, + "kl": 0.0694580078125, + "learning_rate": 2.1423284449082648e-08, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6242 + }, + { + "completion_length": 656.3333740234375, + "epoch": 0.9516768292682927, + "grad_norm": 0.11457157107791129, + "kl": 0.0498046875, + "learning_rate": 2.1289077786923182e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6243 + }, + { + "completion_length": 2329.5, + "epoch": 0.9518292682926829, + "grad_norm": 0.07557086526281527, + "kl": 0.05255126953125, + "learning_rate": 2.1155289812575305e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6244 + }, + { + "completion_length": 1138.0000610351562, + "epoch": 0.9519817073170732, + "grad_norm": 1.1568431162563808, + "kl": 0.0443115234375, + "learning_rate": 2.1021920563920217e-08, + "loss": 0.0018, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 6245 + }, + { + "completion_length": 1385.3334350585938, + "epoch": 0.9521341463414634, + "grad_norm": 0.09624693381964927, + "kl": 0.050537109375, + "learning_rate": 2.088897007872037e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6246 + }, + { + "completion_length": 1272.0000305175781, + "epoch": 0.9522865853658536, + "grad_norm": 0.0948041512979397, + "kl": 0.05224609375, + "learning_rate": 2.0756438394620158e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6247 + }, + { + "completion_length": 1804.166748046875, + "epoch": 0.9524390243902439, + "grad_norm": 0.10501058832162975, + "kl": 0.0594482421875, + "learning_rate": 2.0624325549144894e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6248 + }, + { + "completion_length": 2100.0, + "epoch": 0.9525914634146342, + "grad_norm": 0.059358940689113446, + "kl": 0.058837890625, + "learning_rate": 2.0492631579701493e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6249 + }, + { + "completion_length": 1718.8333435058594, + "epoch": 0.9527439024390244, + "grad_norm": 0.13564216141357527, + "kl": 0.043701171875, + "learning_rate": 2.0361356523578624e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6250 + }, + { + "completion_length": 1877.3333740234375, + "epoch": 0.9528963414634146, + "grad_norm": 0.09493950608122241, + "kl": 0.055419921875, + "learning_rate": 2.023050041794555e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6251 + }, + { + "completion_length": 1724.8334350585938, + "epoch": 0.9530487804878048, + "grad_norm": 0.06644880911536083, + "kl": 0.047119140625, + "learning_rate": 2.0100063299853645e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6252 + }, + { + "completion_length": 2201.3333740234375, + "epoch": 0.9532012195121952, + "grad_norm": 0.07510912735541662, + "kl": 0.045654296875, + "learning_rate": 1.997004520623519e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6253 + }, + { + "completion_length": 1054.0000610351562, + "epoch": 0.9533536585365854, + "grad_norm": 1.8946588023974533, + "kl": 0.0697021484375, + "learning_rate": 1.984044617390407e-08, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6254 + }, + { + "completion_length": 1031.8333740234375, + "epoch": 0.9535060975609756, + "grad_norm": 2.2867702229327453, + "kl": 0.074951171875, + "learning_rate": 1.971126623955577e-08, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6255 + }, + { + "completion_length": 1256.0000305175781, + "epoch": 0.9536585365853658, + "grad_norm": 0.09862652907737388, + "kl": 0.05029296875, + "learning_rate": 1.9582505439766028e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6256 + }, + { + "completion_length": 1253.0000610351562, + "epoch": 0.9538109756097561, + "grad_norm": 0.1133916698967416, + "kl": 0.05224609375, + "learning_rate": 1.9454163810993352e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6257 + }, + { + "completion_length": 1003.0000610351562, + "epoch": 0.9539634146341464, + "grad_norm": 0.15253456297687693, + "kl": 0.101806640625, + "learning_rate": 1.9326241389576837e-08, + "loss": 0.0041, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6258 + }, + { + "completion_length": 1535.8333740234375, + "epoch": 0.9541158536585366, + "grad_norm": 1.3997385763995933, + "kl": 0.0565185546875, + "learning_rate": 1.919873821173651e-08, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6259 + }, + { + "completion_length": 1782.3334350585938, + "epoch": 0.9542682926829268, + "grad_norm": 0.07314904064368233, + "kl": 0.04296875, + "learning_rate": 1.9071654313574495e-08, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6260 + }, + { + "completion_length": 2058.3333740234375, + "epoch": 0.9544207317073171, + "grad_norm": 0.06508427434077795, + "kl": 0.0498046875, + "learning_rate": 1.8944989731073503e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6261 + }, + { + "completion_length": 2052.0000915527344, + "epoch": 0.9545731707317073, + "grad_norm": 0.06887769284819839, + "kl": 0.0439453125, + "learning_rate": 1.881874450009802e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6262 + }, + { + "completion_length": 1054.6666870117188, + "epoch": 0.9547256097560975, + "grad_norm": 0.288846391938912, + "kl": 0.0792236328125, + "learning_rate": 1.869291865639361e-08, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6263 + }, + { + "completion_length": 1017.8333740234375, + "epoch": 0.9548780487804878, + "grad_norm": 0.09316792355322517, + "kl": 0.05029296875, + "learning_rate": 1.856751223558695e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6264 + }, + { + "completion_length": 1521.166748046875, + "epoch": 0.9550304878048781, + "grad_norm": 0.19542557747569328, + "kl": 0.0830078125, + "learning_rate": 1.8442525273186127e-08, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6265 + }, + { + "completion_length": 1243.1666870117188, + "epoch": 0.9551829268292683, + "grad_norm": 0.11249156434625657, + "kl": 0.079345703125, + "learning_rate": 1.8317957804580508e-08, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6266 + }, + { + "completion_length": 1002.8333740234375, + "epoch": 0.9553353658536585, + "grad_norm": 0.0870092868556787, + "kl": 0.056396484375, + "learning_rate": 1.8193809865040377e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6267 + }, + { + "completion_length": 1853.8333740234375, + "epoch": 0.9554878048780487, + "grad_norm": 0.11096331414019862, + "kl": 0.0650634765625, + "learning_rate": 1.807008148971795e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6268 + }, + { + "completion_length": 1726.166748046875, + "epoch": 0.9556402439024391, + "grad_norm": 1.69296363905747, + "kl": 0.069091796875, + "learning_rate": 1.7946772713645533e-08, + "loss": 0.0028, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6269 + }, + { + "completion_length": 2110.3333435058594, + "epoch": 0.9557926829268293, + "grad_norm": 1.9900024169414752, + "kl": 0.0625, + "learning_rate": 1.7823883571737532e-08, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6270 + }, + { + "completion_length": 1871.6667175292969, + "epoch": 0.9559451219512195, + "grad_norm": 1.6349124117939713, + "kl": 0.068603515625, + "learning_rate": 1.770141409878928e-08, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6271 + }, + { + "completion_length": 1897.8334350585938, + "epoch": 0.9560975609756097, + "grad_norm": 0.1227951205086593, + "kl": 0.0477294921875, + "learning_rate": 1.7579364329477375e-08, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6272 + }, + { + "completion_length": 1379.0, + "epoch": 0.95625, + "grad_norm": 0.12752304643324536, + "kl": 0.0509033203125, + "learning_rate": 1.7457734298359006e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6273 + }, + { + "completion_length": 1195.6666870117188, + "epoch": 0.9564024390243903, + "grad_norm": 0.0987559947390147, + "kl": 0.04931640625, + "learning_rate": 1.733652403987329e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6274 + }, + { + "completion_length": 798.6666870117188, + "epoch": 0.9565548780487805, + "grad_norm": 0.17596816255997855, + "kl": 0.0528564453125, + "learning_rate": 1.721573358834011e-08, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6275 + }, + { + "completion_length": 2222.8333740234375, + "epoch": 0.9567073170731707, + "grad_norm": 0.07011286951006196, + "kl": 0.0631103515625, + "learning_rate": 1.7095362977960605e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6276 + }, + { + "completion_length": 1393.6666870117188, + "epoch": 0.956859756097561, + "grad_norm": 0.08411890183655266, + "kl": 0.068115234375, + "learning_rate": 1.697541224281668e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6277 + }, + { + "completion_length": 812.5000305175781, + "epoch": 0.9570121951219512, + "grad_norm": 0.09241987863179127, + "kl": 0.0582275390625, + "learning_rate": 1.6855881416872e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6278 + }, + { + "completion_length": 1052.5000610351562, + "epoch": 0.9571646341463415, + "grad_norm": 0.6707939531484253, + "kl": 0.0810546875, + "learning_rate": 1.6736770533970823e-08, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6279 + }, + { + "completion_length": 1525.5, + "epoch": 0.9573170731707317, + "grad_norm": 0.07063447634553642, + "kl": 0.0565185546875, + "learning_rate": 1.661807962783851e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6280 + }, + { + "completion_length": 1255.1666870117188, + "epoch": 0.957469512195122, + "grad_norm": 0.0807048394293014, + "kl": 0.0528564453125, + "learning_rate": 1.649980873208201e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6281 + }, + { + "completion_length": 1528.666748046875, + "epoch": 0.9576219512195122, + "grad_norm": 1.8220616970844123, + "kl": 0.061279296875, + "learning_rate": 1.6381957880188702e-08, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6282 + }, + { + "completion_length": 1366.3334350585938, + "epoch": 0.9577743902439024, + "grad_norm": 0.09263780916050088, + "kl": 0.060791015625, + "learning_rate": 1.6264527105527393e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6283 + }, + { + "completion_length": 794.3333740234375, + "epoch": 0.9579268292682926, + "grad_norm": 0.1474506261926991, + "kl": 0.0555419921875, + "learning_rate": 1.6147516441347822e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6284 + }, + { + "completion_length": 1111.5000610351562, + "epoch": 0.958079268292683, + "grad_norm": 1.6442749877129745, + "kl": 0.0538330078125, + "learning_rate": 1.6030925920780816e-08, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6285 + }, + { + "completion_length": 1120.8333435058594, + "epoch": 0.9582317073170732, + "grad_norm": 0.15338503512166382, + "kl": 0.04541015625, + "learning_rate": 1.591475557683847e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6286 + }, + { + "completion_length": 1026.8333435058594, + "epoch": 0.9583841463414634, + "grad_norm": 0.16558448296215514, + "kl": 0.0633544921875, + "learning_rate": 1.579900544241347e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6287 + }, + { + "completion_length": 3031.33349609375, + "epoch": 0.9585365853658536, + "grad_norm": 0.04750272383544022, + "kl": 0.0413818359375, + "learning_rate": 1.5683675550279943e-08, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6288 + }, + { + "completion_length": 1497.3333740234375, + "epoch": 0.958689024390244, + "grad_norm": 0.1873224608173453, + "kl": 0.08056640625, + "learning_rate": 1.5568765933092586e-08, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6289 + }, + { + "completion_length": 2191.8333740234375, + "epoch": 0.9588414634146342, + "grad_norm": 0.1421053499556629, + "kl": 0.0665283203125, + "learning_rate": 1.5454276623387552e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6290 + }, + { + "completion_length": 1528.8334350585938, + "epoch": 0.9589939024390244, + "grad_norm": 1.2390734996152404, + "kl": 0.0723876953125, + "learning_rate": 1.534020765358174e-08, + "loss": 0.0029, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6291 + }, + { + "completion_length": 961.1667175292969, + "epoch": 0.9591463414634146, + "grad_norm": 0.13082664207959285, + "kl": 0.064697265625, + "learning_rate": 1.5226559055972976e-08, + "loss": 0.0026, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 6292 + }, + { + "completion_length": 1080.1667175292969, + "epoch": 0.9592987804878049, + "grad_norm": 1.555460944781745, + "kl": 0.0584716796875, + "learning_rate": 1.5113330862740194e-08, + "loss": 0.0023, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 6293 + }, + { + "completion_length": 1388.3333435058594, + "epoch": 0.9594512195121951, + "grad_norm": 0.10674908782085762, + "kl": 0.0723876953125, + "learning_rate": 1.500052310594324e-08, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6294 + }, + { + "completion_length": 1990.5000610351562, + "epoch": 0.9596036585365854, + "grad_norm": 0.0963658561831865, + "kl": 0.048828125, + "learning_rate": 1.4888135817523074e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6295 + }, + { + "completion_length": 1588.166748046875, + "epoch": 0.9597560975609756, + "grad_norm": 0.09597381680934164, + "kl": 0.050537109375, + "learning_rate": 1.4776169029301234e-08, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6296 + }, + { + "completion_length": 922.6667175292969, + "epoch": 0.9599085365853659, + "grad_norm": 0.11234825584498159, + "kl": 0.04833984375, + "learning_rate": 1.4664622772980529e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6297 + }, + { + "completion_length": 1105.5000610351562, + "epoch": 0.9600609756097561, + "grad_norm": 0.160763734668816, + "kl": 0.056396484375, + "learning_rate": 1.4553497080144695e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6298 + }, + { + "completion_length": 925.3333740234375, + "epoch": 0.9602134146341463, + "grad_norm": 0.09671822019906004, + "kl": 0.066162109375, + "learning_rate": 1.444279198225823e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6299 + }, + { + "completion_length": 1787.6666870117188, + "epoch": 0.9603658536585366, + "grad_norm": 0.06357532231312973, + "kl": 0.0506591796875, + "learning_rate": 1.4332507510666393e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6300 + }, + { + "completion_length": 1586.6666870117188, + "epoch": 0.9605182926829269, + "grad_norm": 0.07694871088365135, + "kl": 0.06103515625, + "learning_rate": 1.4222643696595705e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6301 + }, + { + "completion_length": 2006.0, + "epoch": 0.9606707317073171, + "grad_norm": 0.08276324053687126, + "kl": 0.051513671875, + "learning_rate": 1.411320057115345e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6302 + }, + { + "completion_length": 2155.3333740234375, + "epoch": 0.9608231707317073, + "grad_norm": 0.06787126241883742, + "kl": 0.0501708984375, + "learning_rate": 1.4004178165327841e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6303 + }, + { + "completion_length": 1869.6667175292969, + "epoch": 0.9609756097560975, + "grad_norm": 0.10605879734394319, + "kl": 0.06494140625, + "learning_rate": 1.3895576509987685e-08, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6304 + }, + { + "completion_length": 856.3333435058594, + "epoch": 0.9611280487804879, + "grad_norm": 0.10752135431758884, + "kl": 0.05908203125, + "learning_rate": 1.3787395635883049e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6305 + }, + { + "completion_length": 952.6666870117188, + "epoch": 0.9612804878048781, + "grad_norm": 0.09605428022517414, + "kl": 0.0523681640625, + "learning_rate": 1.3679635573644433e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6306 + }, + { + "completion_length": 1717.3333740234375, + "epoch": 0.9614329268292683, + "grad_norm": 0.08079873326891518, + "kl": 0.0491943359375, + "learning_rate": 1.3572296353783764e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6307 + }, + { + "completion_length": 1776.0000610351562, + "epoch": 0.9615853658536585, + "grad_norm": 0.10817614100835128, + "kl": 0.0579833984375, + "learning_rate": 1.346537800669323e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6308 + }, + { + "completion_length": 1461.8333740234375, + "epoch": 0.9617378048780488, + "grad_norm": 0.07569196821721846, + "kl": 0.0604248046875, + "learning_rate": 1.3358880562646114e-08, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6309 + }, + { + "completion_length": 1161.6666870117188, + "epoch": 0.961890243902439, + "grad_norm": 0.07981547335453242, + "kl": 0.044677734375, + "learning_rate": 1.325280405179663e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6310 + }, + { + "completion_length": 1363.5000610351562, + "epoch": 0.9620426829268293, + "grad_norm": 0.11086356012216153, + "kl": 0.0618896484375, + "learning_rate": 1.3147148504179584e-08, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6311 + }, + { + "completion_length": 3075.8333740234375, + "epoch": 0.9621951219512195, + "grad_norm": 0.05060081407126424, + "kl": 0.0435791015625, + "learning_rate": 1.3041913949710715e-08, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6312 + }, + { + "completion_length": 1669.8333740234375, + "epoch": 0.9623475609756098, + "grad_norm": 0.07646125092469978, + "kl": 0.05615234375, + "learning_rate": 1.2937100418186521e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6313 + }, + { + "completion_length": 722.8333435058594, + "epoch": 0.9625, + "grad_norm": 0.1306118443881291, + "kl": 0.068603515625, + "learning_rate": 1.2832707939284426e-08, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6314 + }, + { + "completion_length": 875.3333740234375, + "epoch": 0.9626524390243902, + "grad_norm": 2.1355487892864273, + "kl": 0.0650634765625, + "learning_rate": 1.2728736542562292e-08, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6315 + }, + { + "completion_length": 1278.166748046875, + "epoch": 0.9628048780487805, + "grad_norm": 0.12964402758966082, + "kl": 0.051513671875, + "learning_rate": 1.2625186257459064e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6316 + }, + { + "completion_length": 1526.0000915527344, + "epoch": 0.9629573170731708, + "grad_norm": 0.08678378054530105, + "kl": 0.041259765625, + "learning_rate": 1.2522057113294461e-08, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6317 + }, + { + "completion_length": 1276.0, + "epoch": 0.963109756097561, + "grad_norm": 1.9602692977749572, + "kl": 0.0673828125, + "learning_rate": 1.2419349139268787e-08, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6318 + }, + { + "completion_length": 1592.6666870117188, + "epoch": 0.9632621951219512, + "grad_norm": 0.07397766615492786, + "kl": 0.0494384765625, + "learning_rate": 1.2317062364463117e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6319 + }, + { + "completion_length": 1163.6666870117188, + "epoch": 0.9634146341463414, + "grad_norm": 0.08346322791508216, + "kl": 0.058349609375, + "learning_rate": 1.2215196817839447e-08, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6320 + }, + { + "completion_length": 1670.8334350585938, + "epoch": 0.9635670731707318, + "grad_norm": 0.38713228442659553, + "kl": 0.0552978515625, + "learning_rate": 1.2113752528240207e-08, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6321 + }, + { + "completion_length": 1935.5000610351562, + "epoch": 0.963719512195122, + "grad_norm": 0.08718792129601223, + "kl": 0.046142578125, + "learning_rate": 1.2012729524388755e-08, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6322 + }, + { + "completion_length": 2543.3334350585938, + "epoch": 0.9638719512195122, + "grad_norm": 0.05831485941341898, + "kl": 0.0518798828125, + "learning_rate": 1.1912127834889375e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6323 + }, + { + "completion_length": 1515.8334350585938, + "epoch": 0.9640243902439024, + "grad_norm": 0.1368849501687979, + "kl": 0.0533447265625, + "learning_rate": 1.1811947488226282e-08, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6324 + }, + { + "completion_length": 803.8333435058594, + "epoch": 0.9641768292682927, + "grad_norm": 0.060771787025492935, + "kl": 0.0281982421875, + "learning_rate": 1.1712188512765453e-08, + "loss": 0.0011, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6325 + }, + { + "completion_length": 1710.6666870117188, + "epoch": 0.964329268292683, + "grad_norm": 1.4119223359041861, + "kl": 0.06494140625, + "learning_rate": 1.1612850936752961e-08, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6326 + }, + { + "completion_length": 761.0000305175781, + "epoch": 0.9644817073170732, + "grad_norm": 1.8223747327851638, + "kl": 0.0645751953125, + "learning_rate": 1.151393478831514e-08, + "loss": 0.0026, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6327 + }, + { + "completion_length": 3281.8333740234375, + "epoch": 0.9646341463414634, + "grad_norm": 0.04843201336434177, + "kl": 0.048583984375, + "learning_rate": 1.1415440095460083e-08, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6328 + }, + { + "completion_length": 1728.0000610351562, + "epoch": 0.9647865853658537, + "grad_norm": 0.09823486572477826, + "kl": 0.0513916015625, + "learning_rate": 1.1317366886075486e-08, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6329 + }, + { + "completion_length": 707.3333740234375, + "epoch": 0.9649390243902439, + "grad_norm": 0.0959979232456828, + "kl": 0.05224609375, + "learning_rate": 1.1219715187930468e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6330 + }, + { + "completion_length": 1899.5000457763672, + "epoch": 0.9650914634146341, + "grad_norm": 2.0135208797733988, + "kl": 0.056396484375, + "learning_rate": 1.1122485028674412e-08, + "loss": 0.0023, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 6331 + }, + { + "completion_length": 961.3333740234375, + "epoch": 0.9652439024390244, + "grad_norm": 0.10420116466944887, + "kl": 0.0599365234375, + "learning_rate": 1.1025676435837296e-08, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6332 + }, + { + "completion_length": 1383.3333740234375, + "epoch": 0.9653963414634147, + "grad_norm": 0.1360900904991828, + "kl": 0.048095703125, + "learning_rate": 1.0929289436830026e-08, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6333 + }, + { + "completion_length": 1079.3333740234375, + "epoch": 0.9655487804878049, + "grad_norm": 0.10477732658139094, + "kl": 0.0504150390625, + "learning_rate": 1.0833324058944105e-08, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6334 + }, + { + "completion_length": 1292.5000305175781, + "epoch": 0.9657012195121951, + "grad_norm": 0.38048825466699576, + "kl": 0.076904296875, + "learning_rate": 1.0737780329351299e-08, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6335 + }, + { + "completion_length": 1989.8333740234375, + "epoch": 0.9658536585365853, + "grad_norm": 0.1534059475759483, + "kl": 0.069091796875, + "learning_rate": 1.06426582751043e-08, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6336 + }, + { + "completion_length": 2510.6666870117188, + "epoch": 0.9660060975609757, + "grad_norm": 0.06320936586039491, + "kl": 0.0516357421875, + "learning_rate": 1.0547957923136398e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6337 + }, + { + "completion_length": 2220.6666870117188, + "epoch": 0.9661585365853659, + "grad_norm": 0.07451760100986249, + "kl": 0.04931640625, + "learning_rate": 1.0453679300261143e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6338 + }, + { + "completion_length": 1195.0, + "epoch": 0.9663109756097561, + "grad_norm": 0.07769491064085755, + "kl": 0.051513671875, + "learning_rate": 1.035982243317335e-08, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6339 + }, + { + "completion_length": 2210.166748046875, + "epoch": 0.9664634146341463, + "grad_norm": 0.09857305176662823, + "kl": 0.0499267578125, + "learning_rate": 1.0266387348447758e-08, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6340 + }, + { + "completion_length": 628.6666717529297, + "epoch": 0.9666158536585366, + "grad_norm": 0.22593048103889077, + "kl": 0.0555419921875, + "learning_rate": 1.0173374072539876e-08, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6341 + }, + { + "completion_length": 794.0000305175781, + "epoch": 0.9667682926829269, + "grad_norm": 0.10931042708046561, + "kl": 0.0570068359375, + "learning_rate": 1.0080782631785968e-08, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6342 + }, + { + "completion_length": 1319.3333740234375, + "epoch": 0.9669207317073171, + "grad_norm": 1.292672269384902, + "kl": 0.05322265625, + "learning_rate": 9.988613052402729e-09, + "loss": 0.0021, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6343 + }, + { + "completion_length": 1841.5, + "epoch": 0.9670731707317073, + "grad_norm": 0.07289692601083929, + "kl": 0.050537109375, + "learning_rate": 9.896865360487451e-09, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6344 + }, + { + "completion_length": 2485.666748046875, + "epoch": 0.9672256097560976, + "grad_norm": 0.07632725325357335, + "kl": 0.052978515625, + "learning_rate": 9.805539582017686e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6345 + }, + { + "completion_length": 1265.666748046875, + "epoch": 0.9673780487804878, + "grad_norm": 0.09642792001359991, + "kl": 0.0472412109375, + "learning_rate": 9.714635742851919e-09, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6346 + }, + { + "completion_length": 929.6666870117188, + "epoch": 0.967530487804878, + "grad_norm": 0.10039533211465147, + "kl": 0.05615234375, + "learning_rate": 9.624153868729057e-09, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6347 + }, + { + "completion_length": 1159.8333587646484, + "epoch": 0.9676829268292683, + "grad_norm": 2.455987741134173, + "kl": 0.077392578125, + "learning_rate": 9.534093985268444e-09, + "loss": 0.0031, + "reward": 0.5000000149011612, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.5000000149011612, + "rewards/format_reward": 0.0, + "step": 6348 + }, + { + "completion_length": 1171.0000610351562, + "epoch": 0.9678353658536586, + "grad_norm": 0.09601896700102382, + "kl": 0.04290771484375, + "learning_rate": 9.444456117969847e-09, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6349 + }, + { + "completion_length": 1619.5000610351562, + "epoch": 0.9679878048780488, + "grad_norm": 0.06559363629367013, + "kl": 0.0369873046875, + "learning_rate": 9.355240292213796e-09, + "loss": 0.0015, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6350 + }, + { + "completion_length": 1128.1666870117188, + "epoch": 0.968140243902439, + "grad_norm": 0.12457064002478264, + "kl": 0.0855712890625, + "learning_rate": 9.266446533261252e-09, + "loss": 0.0034, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6351 + }, + { + "completion_length": 1507.0000915527344, + "epoch": 0.9682926829268292, + "grad_norm": 0.15085746830991373, + "kl": 0.064697265625, + "learning_rate": 9.178074866253605e-09, + "loss": 0.0026, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6352 + }, + { + "completion_length": 2502.83349609375, + "epoch": 0.9684451219512196, + "grad_norm": 0.09554432028016809, + "kl": 0.068115234375, + "learning_rate": 9.090125316212506e-09, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6353 + }, + { + "completion_length": 1975.0001220703125, + "epoch": 0.9685975609756098, + "grad_norm": 0.058381607598538846, + "kl": 0.0482177734375, + "learning_rate": 9.002597908040534e-09, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6354 + }, + { + "completion_length": 867.0000305175781, + "epoch": 0.96875, + "grad_norm": 0.09904823926249665, + "kl": 0.04638671875, + "learning_rate": 8.91549266652053e-09, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6355 + }, + { + "completion_length": 2519.0, + "epoch": 0.9689024390243902, + "grad_norm": 0.0914043746001704, + "kl": 0.05419921875, + "learning_rate": 8.82880961631577e-09, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6356 + }, + { + "completion_length": 966.1666870117188, + "epoch": 0.9690548780487804, + "grad_norm": 0.2288749909402434, + "kl": 0.0751953125, + "learning_rate": 8.742548781970117e-09, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6357 + }, + { + "completion_length": 1816.5, + "epoch": 0.9692073170731708, + "grad_norm": 0.17371037051599492, + "kl": 0.068115234375, + "learning_rate": 8.656710187907536e-09, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6358 + }, + { + "completion_length": 1430.0000305175781, + "epoch": 0.969359756097561, + "grad_norm": 1.416144034832163, + "kl": 0.05517578125, + "learning_rate": 8.571293858432916e-09, + "loss": 0.0022, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 6359 + }, + { + "completion_length": 1737.5001220703125, + "epoch": 0.9695121951219512, + "grad_norm": 1.5039285289968416, + "kl": 0.0478515625, + "learning_rate": 8.486299817731412e-09, + "loss": 0.0019, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6360 + }, + { + "completion_length": 1520.666748046875, + "epoch": 0.9696646341463414, + "grad_norm": 1.3845090633032489, + "kl": 0.0555419921875, + "learning_rate": 8.401728089868277e-09, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6361 + }, + { + "completion_length": 2316.33349609375, + "epoch": 0.9698170731707317, + "grad_norm": 0.07520462991700437, + "kl": 0.0501708984375, + "learning_rate": 8.317578698789684e-09, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6362 + }, + { + "completion_length": 1891.0, + "epoch": 0.969969512195122, + "grad_norm": 1.2809635193452202, + "kl": 0.0648193359375, + "learning_rate": 8.233851668321913e-09, + "loss": 0.0026, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6363 + }, + { + "completion_length": 2276.0001220703125, + "epoch": 0.9701219512195122, + "grad_norm": 0.06545984930481681, + "kl": 0.04345703125, + "learning_rate": 8.150547022171828e-09, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6364 + }, + { + "completion_length": 1287.0, + "epoch": 0.9702743902439024, + "grad_norm": 0.18066686828765885, + "kl": 0.0570068359375, + "learning_rate": 8.067664783926731e-09, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6365 + }, + { + "completion_length": 1471.6666870117188, + "epoch": 0.9704268292682927, + "grad_norm": 0.10946281708847105, + "kl": 0.078125, + "learning_rate": 7.985204977054017e-09, + "loss": 0.0031, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6366 + }, + { + "completion_length": 1912.166748046875, + "epoch": 0.9705792682926829, + "grad_norm": 0.9853963112254226, + "kl": 0.05078125, + "learning_rate": 7.903167624901508e-09, + "loss": 0.002, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6367 + }, + { + "completion_length": 1056.5000305175781, + "epoch": 0.9707317073170731, + "grad_norm": 1.3573061021244996, + "kl": 0.0572509765625, + "learning_rate": 7.821552750697958e-09, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6368 + }, + { + "completion_length": 1690.0000610351562, + "epoch": 0.9708841463414634, + "grad_norm": 0.08389090826306934, + "kl": 0.0526123046875, + "learning_rate": 7.74036037755188e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6369 + }, + { + "completion_length": 639.6666870117188, + "epoch": 0.9710365853658537, + "grad_norm": 1.8044523039407454, + "kl": 0.091064453125, + "learning_rate": 7.659590528452554e-09, + "loss": 0.0036, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6370 + }, + { + "completion_length": 2313.0000610351562, + "epoch": 0.9711890243902439, + "grad_norm": 0.08393037858305087, + "kl": 0.0518798828125, + "learning_rate": 7.57924322626935e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6371 + }, + { + "completion_length": 1386.3333740234375, + "epoch": 0.9713414634146341, + "grad_norm": 0.08109866137232985, + "kl": 0.0430908203125, + "learning_rate": 7.499318493751905e-09, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6372 + }, + { + "completion_length": 863.8333740234375, + "epoch": 0.9714939024390243, + "grad_norm": 0.09630918160386971, + "kl": 0.0648193359375, + "learning_rate": 7.419816353530784e-09, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6373 + }, + { + "completion_length": 743.5000305175781, + "epoch": 0.9716463414634147, + "grad_norm": 0.10805940813542103, + "kl": 0.053466796875, + "learning_rate": 7.3407368281164785e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6374 + }, + { + "completion_length": 858.5000305175781, + "epoch": 0.9717987804878049, + "grad_norm": 0.07417173657015567, + "kl": 0.04541015625, + "learning_rate": 7.262079939899579e-09, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6375 + }, + { + "completion_length": 1712.1666870117188, + "epoch": 0.9719512195121951, + "grad_norm": 0.07371275564086882, + "kl": 0.0574951171875, + "learning_rate": 7.1838457111516044e-09, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6376 + }, + { + "completion_length": 1850.3333740234375, + "epoch": 0.9721036585365853, + "grad_norm": 0.06643473154875269, + "kl": 0.0582275390625, + "learning_rate": 7.106034164023833e-09, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6377 + }, + { + "completion_length": 1384.6666870117188, + "epoch": 0.9722560975609756, + "grad_norm": 0.0923039641595129, + "kl": 0.0545654296875, + "learning_rate": 7.028645320548144e-09, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6378 + }, + { + "completion_length": 1527.5000610351562, + "epoch": 0.9724085365853659, + "grad_norm": 0.08069220467846999, + "kl": 0.0494384765625, + "learning_rate": 6.951679202637007e-09, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6379 + }, + { + "completion_length": 1482.666748046875, + "epoch": 0.9725609756097561, + "grad_norm": 0.09022309490064552, + "kl": 0.060302734375, + "learning_rate": 6.875135832082657e-09, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6380 + }, + { + "completion_length": 2276.5000610351562, + "epoch": 0.9727134146341463, + "grad_norm": 0.075422398114749, + "kl": 0.045166015625, + "learning_rate": 6.7990152305579255e-09, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6381 + }, + { + "completion_length": 1473.3333435058594, + "epoch": 0.9728658536585366, + "grad_norm": 0.07913859170360517, + "kl": 0.0517578125, + "learning_rate": 6.723317419615737e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6382 + }, + { + "completion_length": 1854.6666870117188, + "epoch": 0.9730182926829268, + "grad_norm": 0.1505925699031243, + "kl": 0.0452880859375, + "learning_rate": 6.648042420689615e-09, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6383 + }, + { + "completion_length": 1823.3333740234375, + "epoch": 0.973170731707317, + "grad_norm": 0.07127649410688208, + "kl": 0.0550537109375, + "learning_rate": 6.573190255093342e-09, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6384 + }, + { + "completion_length": 2914.0001220703125, + "epoch": 0.9733231707317073, + "grad_norm": 0.08112771779695614, + "kl": 0.0498046875, + "learning_rate": 6.498760944020465e-09, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6385 + }, + { + "completion_length": 1372.3333740234375, + "epoch": 0.9734756097560976, + "grad_norm": 1.492662775113769, + "kl": 0.0665283203125, + "learning_rate": 6.424754508545627e-09, + "loss": 0.0027, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6386 + }, + { + "completion_length": 2131.5001220703125, + "epoch": 0.9736280487804878, + "grad_norm": 0.1092487428272197, + "kl": 0.060302734375, + "learning_rate": 6.3511709696229e-09, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6387 + }, + { + "completion_length": 1309.3333740234375, + "epoch": 0.973780487804878, + "grad_norm": 0.1257402883259211, + "kl": 0.072265625, + "learning_rate": 6.278010348087282e-09, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6388 + }, + { + "completion_length": 723.8333435058594, + "epoch": 0.9739329268292682, + "grad_norm": 0.11301152761495746, + "kl": 0.06640625, + "learning_rate": 6.2052726646535385e-09, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6389 + }, + { + "completion_length": 1108.5000610351562, + "epoch": 0.9740853658536586, + "grad_norm": 0.08765918905089243, + "kl": 0.062744140625, + "learning_rate": 6.1329579399171945e-09, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6390 + }, + { + "completion_length": 2692.6666870117188, + "epoch": 0.9742378048780488, + "grad_norm": 0.2133757003794908, + "kl": 0.0560302734375, + "learning_rate": 6.061066194353371e-09, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6391 + }, + { + "completion_length": 1210.5000610351562, + "epoch": 0.974390243902439, + "grad_norm": 0.12895106178377197, + "kl": 0.0633544921875, + "learning_rate": 5.989597448317785e-09, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6392 + }, + { + "completion_length": 1842.8333435058594, + "epoch": 0.9745426829268292, + "grad_norm": 0.06654201345732255, + "kl": 0.04345703125, + "learning_rate": 5.91855172204675e-09, + "loss": 0.0017, + "reward": 1.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 1.0, + "rewards/format_reward": 0.0, + "step": 6393 + }, + { + "completion_length": 966.1667175292969, + "epoch": 0.9746951219512195, + "grad_norm": 0.14553395529239496, + "kl": 0.0704345703125, + "learning_rate": 5.847929035656008e-09, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6394 + }, + { + "completion_length": 1357.6666870117188, + "epoch": 0.9748475609756098, + "grad_norm": 0.07931302440842385, + "kl": 0.0509033203125, + "learning_rate": 5.7777294091422295e-09, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6395 + }, + { + "completion_length": 1666.0000610351562, + "epoch": 0.975, + "grad_norm": 0.13067112267761538, + "kl": 0.07958984375, + "learning_rate": 5.707952862381682e-09, + "loss": 0.0032, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6396 + }, + { + "completion_length": 2098.3333740234375, + "epoch": 0.9751524390243902, + "grad_norm": 0.11216932161695733, + "kl": 0.0684814453125, + "learning_rate": 5.638599415131563e-09, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6397 + }, + { + "completion_length": 1907.6666870117188, + "epoch": 0.9753048780487805, + "grad_norm": 1.2612369753292645, + "kl": 0.064697265625, + "learning_rate": 5.569669087028495e-09, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6398 + }, + { + "completion_length": 1395.1666870117188, + "epoch": 0.9754573170731707, + "grad_norm": 0.13172634791996354, + "kl": 0.0506591796875, + "learning_rate": 5.501161897589868e-09, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6399 + }, + { + "completion_length": 842.1666870117188, + "epoch": 0.975609756097561, + "grad_norm": 0.1490151528136116, + "kl": 0.047119140625, + "learning_rate": 5.433077866212999e-09, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6400 + }, + { + "completion_length": 1040.6666870117188, + "epoch": 0.9757621951219512, + "grad_norm": 0.2895050156158568, + "kl": 0.0687255859375, + "learning_rate": 5.365417012175467e-09, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6401 + }, + { + "completion_length": 1206.5, + "epoch": 0.9759146341463415, + "grad_norm": 0.1452454183533058, + "kl": 0.063232421875, + "learning_rate": 5.298179354635113e-09, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6402 + }, + { + "completion_length": 1885.0000915527344, + "epoch": 0.9760670731707317, + "grad_norm": 1.541977625629881, + "kl": 0.05859375, + "learning_rate": 5.231364912629877e-09, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6403 + }, + { + "completion_length": 1114.5, + "epoch": 0.9762195121951219, + "grad_norm": 0.14771542471033275, + "kl": 0.0728759765625, + "learning_rate": 5.164973705077624e-09, + "loss": 0.0029, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6404 + }, + { + "completion_length": 1313.3333740234375, + "epoch": 0.9763719512195121, + "grad_norm": 0.09181667365846417, + "kl": 0.0614013671875, + "learning_rate": 5.0990057507768194e-09, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6405 + }, + { + "completion_length": 2075.666748046875, + "epoch": 0.9765243902439025, + "grad_norm": 1.1866626364111876, + "kl": 0.0594482421875, + "learning_rate": 5.033461068405854e-09, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6406 + }, + { + "completion_length": 1268.1666870117188, + "epoch": 0.9766768292682927, + "grad_norm": 0.09231572178780163, + "kl": 0.0533447265625, + "learning_rate": 4.968339676523215e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6407 + }, + { + "completion_length": 938.1666870117188, + "epoch": 0.9768292682926829, + "grad_norm": 0.10210994164625173, + "kl": 0.037353515625, + "learning_rate": 4.903641593567654e-09, + "loss": 0.0015, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6408 + }, + { + "completion_length": 1082.3333435058594, + "epoch": 0.9769817073170731, + "grad_norm": 0.08865219714479923, + "kl": 0.0577392578125, + "learning_rate": 4.839366837858017e-09, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6409 + }, + { + "completion_length": 1039.5, + "epoch": 0.9771341463414634, + "grad_norm": 0.24817561751413938, + "kl": 0.0721435546875, + "learning_rate": 4.775515427593247e-09, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6410 + }, + { + "completion_length": 1828.666748046875, + "epoch": 0.9772865853658537, + "grad_norm": 0.14091382697961116, + "kl": 0.06640625, + "learning_rate": 4.712087380852881e-09, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6411 + }, + { + "completion_length": 1708.0, + "epoch": 0.9774390243902439, + "grad_norm": 0.11570368068652973, + "kl": 0.060302734375, + "learning_rate": 4.649082715595554e-09, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6412 + }, + { + "completion_length": 1345.8333740234375, + "epoch": 0.9775914634146341, + "grad_norm": 0.10100914066730972, + "kl": 0.06201171875, + "learning_rate": 4.586501449660996e-09, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6413 + }, + { + "completion_length": 1259.6666870117188, + "epoch": 0.9777439024390244, + "grad_norm": 1.270169478713776, + "kl": 0.0560302734375, + "learning_rate": 4.524343600768699e-09, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6414 + }, + { + "completion_length": 674.5, + "epoch": 0.9778963414634146, + "grad_norm": 0.08994028565699767, + "kl": 0.04559326171875, + "learning_rate": 4.462609186518252e-09, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6415 + }, + { + "completion_length": 933.8333740234375, + "epoch": 0.9780487804878049, + "grad_norm": 0.07823854552300569, + "kl": 0.034912109375, + "learning_rate": 4.401298224389338e-09, + "loss": 0.0014, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6416 + }, + { + "completion_length": 2671.666748046875, + "epoch": 0.9782012195121951, + "grad_norm": 0.1744697202542026, + "kl": 0.0517578125, + "learning_rate": 4.34041073174174e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6417 + }, + { + "completion_length": 1928.8333740234375, + "epoch": 0.9783536585365854, + "grad_norm": 0.07542495383906822, + "kl": 0.0516357421875, + "learning_rate": 4.279946725815498e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6418 + }, + { + "completion_length": 1605.1666870117188, + "epoch": 0.9785060975609756, + "grad_norm": 0.08469216594548756, + "kl": 0.0504150390625, + "learning_rate": 4.2199062237304186e-09, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6419 + }, + { + "completion_length": 709.3333435058594, + "epoch": 0.9786585365853658, + "grad_norm": 2.0541295371405908, + "kl": 0.0673828125, + "learning_rate": 4.160289242486737e-09, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6420 + }, + { + "completion_length": 1721.5000610351562, + "epoch": 0.978810975609756, + "grad_norm": 2.1171571034891525, + "kl": 0.07568359375, + "learning_rate": 4.101095798964616e-09, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6421 + }, + { + "completion_length": 1543.5, + "epoch": 0.9789634146341464, + "grad_norm": 0.08896326740663747, + "kl": 0.06103515625, + "learning_rate": 4.042325909924316e-09, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6422 + }, + { + "completion_length": 932.3333435058594, + "epoch": 0.9791158536585366, + "grad_norm": 0.08709660514058969, + "kl": 0.0528564453125, + "learning_rate": 3.983979592006026e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6423 + }, + { + "completion_length": 1298.5, + "epoch": 0.9792682926829268, + "grad_norm": 0.08670411396396621, + "kl": 0.061767578125, + "learning_rate": 3.926056861730532e-09, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6424 + }, + { + "completion_length": 887.1666717529297, + "epoch": 0.979420731707317, + "grad_norm": 0.12259217864019457, + "kl": 0.09619140625, + "learning_rate": 3.86855773549788e-09, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6425 + }, + { + "completion_length": 1665.8333740234375, + "epoch": 0.9795731707317074, + "grad_norm": 0.10332988170301788, + "kl": 0.05810546875, + "learning_rate": 3.811482229588714e-09, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6426 + }, + { + "completion_length": 1348.6667175292969, + "epoch": 0.9797256097560976, + "grad_norm": 0.1692220039873087, + "kl": 0.07861328125, + "learning_rate": 3.754830360163608e-09, + "loss": 0.0031, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6427 + }, + { + "completion_length": 981.1667175292969, + "epoch": 0.9798780487804878, + "grad_norm": 0.08589264591410063, + "kl": 0.0411376953125, + "learning_rate": 3.6986021432633967e-09, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6428 + }, + { + "completion_length": 898.1666870117188, + "epoch": 0.980030487804878, + "grad_norm": 0.09228012922524673, + "kl": 0.059326171875, + "learning_rate": 3.6427975948085114e-09, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6429 + }, + { + "completion_length": 2348.0001220703125, + "epoch": 0.9801829268292683, + "grad_norm": 1.3167725557854075, + "kl": 0.0517578125, + "learning_rate": 3.5874167305996465e-09, + "loss": 0.0021, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6430 + }, + { + "completion_length": 1485.3334350585938, + "epoch": 0.9803353658536585, + "grad_norm": 0.19134205794649584, + "kl": 0.076171875, + "learning_rate": 3.5324595663175916e-09, + "loss": 0.003, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6431 + }, + { + "completion_length": 1809.3333435058594, + "epoch": 0.9804878048780488, + "grad_norm": 0.0673568364087787, + "kl": 0.05126953125, + "learning_rate": 3.4779261175232334e-09, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6432 + }, + { + "completion_length": 1223.1666870117188, + "epoch": 0.980640243902439, + "grad_norm": 0.12012103629773813, + "kl": 0.066650390625, + "learning_rate": 3.4238163996573867e-09, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6433 + }, + { + "completion_length": 1129.5000610351562, + "epoch": 0.9807926829268293, + "grad_norm": 0.10811802676240459, + "kl": 0.06298828125, + "learning_rate": 3.37013042804063e-09, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6434 + }, + { + "completion_length": 818.1666870117188, + "epoch": 0.9809451219512195, + "grad_norm": 0.14524504317700507, + "kl": 0.066650390625, + "learning_rate": 3.316868217874136e-09, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6435 + }, + { + "completion_length": 1910.6667175292969, + "epoch": 0.9810975609756097, + "grad_norm": 0.08419342686062316, + "kl": 0.0528564453125, + "learning_rate": 3.2640297842385092e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6436 + }, + { + "completion_length": 1040.6666870117188, + "epoch": 0.98125, + "grad_norm": 0.106677524762748, + "kl": 0.0570068359375, + "learning_rate": 3.2116151420947815e-09, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6437 + }, + { + "completion_length": 1774.5000610351562, + "epoch": 0.9814024390243903, + "grad_norm": 2.218084823590198, + "kl": 0.0677490234375, + "learning_rate": 3.1596243062837483e-09, + "loss": 0.0027, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6438 + }, + { + "completion_length": 1019.0000305175781, + "epoch": 0.9815548780487805, + "grad_norm": 2.0009254472005877, + "kl": 0.0740966796875, + "learning_rate": 3.1080572915263007e-09, + "loss": 0.003, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6439 + }, + { + "completion_length": 851.6666870117188, + "epoch": 0.9817073170731707, + "grad_norm": 0.10922507812487445, + "kl": 0.0535888671875, + "learning_rate": 3.0569141124234256e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6440 + }, + { + "completion_length": 1597.8333435058594, + "epoch": 0.9818597560975609, + "grad_norm": 0.8682976571449089, + "kl": 0.0684814453125, + "learning_rate": 3.0061947834558735e-09, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6441 + }, + { + "completion_length": 655.3333435058594, + "epoch": 0.9820121951219513, + "grad_norm": 0.1019356951232059, + "kl": 0.07080078125, + "learning_rate": 2.955899318984656e-09, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6442 + }, + { + "completion_length": 2653.8333740234375, + "epoch": 0.9821646341463415, + "grad_norm": 0.07810056774720084, + "kl": 0.0513916015625, + "learning_rate": 2.906027733250383e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6443 + }, + { + "completion_length": 1019.8333740234375, + "epoch": 0.9823170731707317, + "grad_norm": 0.08118330127455897, + "kl": 0.0518798828125, + "learning_rate": 2.8565800403740906e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6444 + }, + { + "completion_length": 1238.0, + "epoch": 0.9824695121951219, + "grad_norm": 0.12499816426579999, + "kl": 0.070068359375, + "learning_rate": 2.807556254356414e-09, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6445 + }, + { + "completion_length": 1594.1667175292969, + "epoch": 0.9826219512195122, + "grad_norm": 2.5113850149796253, + "kl": 0.0765380859375, + "learning_rate": 2.7589563890782487e-09, + "loss": 0.0031, + "reward": 0.3333333432674408, + "reward_std": 0.5773502588272095, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6446 + }, + { + "completion_length": 1623.1666870117188, + "epoch": 0.9827743902439025, + "grad_norm": 0.10759922748761036, + "kl": 0.070068359375, + "learning_rate": 2.7107804583004214e-09, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6447 + }, + { + "completion_length": 1287.0, + "epoch": 0.9829268292682927, + "grad_norm": 0.22967664314134847, + "kl": 0.05615234375, + "learning_rate": 2.6630284756635204e-09, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6448 + }, + { + "completion_length": 1225.3333740234375, + "epoch": 0.9830792682926829, + "grad_norm": 0.08879117418271414, + "kl": 0.0596923828125, + "learning_rate": 2.6157004546882303e-09, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6449 + }, + { + "completion_length": 1684.6666870117188, + "epoch": 0.9832317073170732, + "grad_norm": 0.0687781832736959, + "kl": 0.048095703125, + "learning_rate": 2.5687964087751647e-09, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6450 + }, + { + "completion_length": 1223.8333435058594, + "epoch": 0.9833841463414634, + "grad_norm": 1.38097448591535, + "kl": 0.069580078125, + "learning_rate": 2.522316351205034e-09, + "loss": 0.0028, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6451 + }, + { + "completion_length": 2930.666748046875, + "epoch": 0.9835365853658536, + "grad_norm": 0.03721261457472512, + "kl": 0.0411376953125, + "learning_rate": 2.4762602951383104e-09, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6452 + }, + { + "completion_length": 595.1666717529297, + "epoch": 0.9836890243902439, + "grad_norm": 0.11985042731979623, + "kl": 0.079345703125, + "learning_rate": 2.430628253615397e-09, + "loss": 0.0032, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6453 + }, + { + "completion_length": 943.5000610351562, + "epoch": 0.9838414634146342, + "grad_norm": 0.12088426010342616, + "kl": 0.0606689453125, + "learning_rate": 2.3854202395567905e-09, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6454 + }, + { + "completion_length": 1590.5, + "epoch": 0.9839939024390244, + "grad_norm": 0.12864948864380552, + "kl": 0.070068359375, + "learning_rate": 2.3406362657629187e-09, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6455 + }, + { + "completion_length": 1489.3333740234375, + "epoch": 0.9841463414634146, + "grad_norm": 0.09350435946419104, + "kl": 0.0537109375, + "learning_rate": 2.2962763449141387e-09, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6456 + }, + { + "completion_length": 1563.3333740234375, + "epoch": 0.9842987804878048, + "grad_norm": 0.06969558368756518, + "kl": 0.044921875, + "learning_rate": 2.25234048957057e-09, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6457 + }, + { + "completion_length": 926.6666870117188, + "epoch": 0.9844512195121952, + "grad_norm": 0.12732537726152882, + "kl": 0.068359375, + "learning_rate": 2.208828712172262e-09, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6458 + }, + { + "completion_length": 759.8333740234375, + "epoch": 0.9846036585365854, + "grad_norm": 1.9641098265053776, + "kl": 0.066650390625, + "learning_rate": 2.1657410250393584e-09, + "loss": 0.0027, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6459 + }, + { + "completion_length": 2387.8333435058594, + "epoch": 0.9847560975609756, + "grad_norm": 0.11095717780793751, + "kl": 0.064208984375, + "learning_rate": 2.123077440372101e-09, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6460 + }, + { + "completion_length": 1510.1666870117188, + "epoch": 0.9849085365853658, + "grad_norm": 0.14974773571971503, + "kl": 0.0672607421875, + "learning_rate": 2.08083797025016e-09, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6461 + }, + { + "completion_length": 1724.666748046875, + "epoch": 0.9850609756097561, + "grad_norm": 0.1129718355112615, + "kl": 0.0498046875, + "learning_rate": 2.0390226266336352e-09, + "loss": 0.002, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6462 + }, + { + "completion_length": 1730.5, + "epoch": 0.9852134146341464, + "grad_norm": 0.06130620273962867, + "kl": 0.04345703125, + "learning_rate": 1.9976314213620563e-09, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6463 + }, + { + "completion_length": 806.3333435058594, + "epoch": 0.9853658536585366, + "grad_norm": 0.11695884940211533, + "kl": 0.06005859375, + "learning_rate": 1.9566643661550478e-09, + "loss": 0.0024, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6464 + }, + { + "completion_length": 1945.6666870117188, + "epoch": 0.9855182926829268, + "grad_norm": 1.656329261817117, + "kl": 0.056396484375, + "learning_rate": 1.916121472612331e-09, + "loss": 0.0023, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6465 + }, + { + "completion_length": 1253.5000610351562, + "epoch": 0.9856707317073171, + "grad_norm": 0.13090590582371048, + "kl": 0.042724609375, + "learning_rate": 1.8760027522133903e-09, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6466 + }, + { + "completion_length": 1212.3333435058594, + "epoch": 0.9858231707317073, + "grad_norm": 0.08686926037672804, + "kl": 0.061767578125, + "learning_rate": 1.8363082163174727e-09, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6467 + }, + { + "completion_length": 1280.0, + "epoch": 0.9859756097560975, + "grad_norm": 0.12817224852232387, + "kl": 0.068115234375, + "learning_rate": 1.7970378761639206e-09, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6468 + }, + { + "completion_length": 1362.3333435058594, + "epoch": 0.9861280487804878, + "grad_norm": 0.15469194618778673, + "kl": 0.0511474609375, + "learning_rate": 1.7581917428720062e-09, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6469 + }, + { + "completion_length": 1206.0, + "epoch": 0.9862804878048781, + "grad_norm": 0.1698549838430996, + "kl": 0.0709228515625, + "learning_rate": 1.7197698274404317e-09, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6470 + }, + { + "completion_length": 1965.166748046875, + "epoch": 0.9864329268292683, + "grad_norm": 0.0870583317354827, + "kl": 0.052490234375, + "learning_rate": 1.6817721407483276e-09, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6471 + }, + { + "completion_length": 1375.3333740234375, + "epoch": 0.9865853658536585, + "grad_norm": 0.09369520547603186, + "kl": 0.0487060546875, + "learning_rate": 1.6441986935545884e-09, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6472 + }, + { + "completion_length": 997.1667175292969, + "epoch": 0.9867378048780487, + "grad_norm": 0.09118892969804851, + "kl": 0.06982421875, + "learning_rate": 1.6070494964978699e-09, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6473 + }, + { + "completion_length": 832.3333740234375, + "epoch": 0.9868902439024391, + "grad_norm": 0.1122924533836075, + "kl": 0.0460205078125, + "learning_rate": 1.570324560096592e-09, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6474 + }, + { + "completion_length": 1339.3333740234375, + "epoch": 0.9870426829268293, + "grad_norm": 0.3108872366217068, + "kl": 0.0570068359375, + "learning_rate": 1.5340238947492701e-09, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6475 + }, + { + "completion_length": 1113.8333740234375, + "epoch": 0.9871951219512195, + "grad_norm": 0.06936600386891499, + "kl": 0.0419921875, + "learning_rate": 1.4981475107341825e-09, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6476 + }, + { + "completion_length": 1517.5, + "epoch": 0.9873475609756097, + "grad_norm": 0.1005868518283825, + "kl": 0.0655517578125, + "learning_rate": 1.4626954182095365e-09, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6477 + }, + { + "completion_length": 811.3333740234375, + "epoch": 0.9875, + "grad_norm": 0.1743267327739878, + "kl": 0.044677734375, + "learning_rate": 1.4276676272133026e-09, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6478 + }, + { + "completion_length": 1041.5000305175781, + "epoch": 0.9876524390243903, + "grad_norm": 0.12960042549341785, + "kl": 0.0511474609375, + "learning_rate": 1.3930641476635475e-09, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6479 + }, + { + "completion_length": 783.8333435058594, + "epoch": 0.9878048780487805, + "grad_norm": 2.1122084734950834, + "kl": 0.080810546875, + "learning_rate": 1.3588849893579336e-09, + "loss": 0.0032, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6480 + }, + { + "completion_length": 1537.6666870117188, + "epoch": 0.9879573170731707, + "grad_norm": 0.07993546876809046, + "kl": 0.043701171875, + "learning_rate": 1.3251301619742196e-09, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6481 + }, + { + "completion_length": 614.3333435058594, + "epoch": 0.988109756097561, + "grad_norm": 2.2279509851573587, + "kl": 0.068115234375, + "learning_rate": 1.2917996750695937e-09, + "loss": 0.0027, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6482 + }, + { + "completion_length": 1706.5, + "epoch": 0.9882621951219512, + "grad_norm": 0.05825731719710079, + "kl": 0.055908203125, + "learning_rate": 1.258893538081507e-09, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6483 + }, + { + "completion_length": 870.1666870117188, + "epoch": 0.9884146341463415, + "grad_norm": 0.10456628880033637, + "kl": 0.056640625, + "learning_rate": 1.226411760327173e-09, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6484 + }, + { + "completion_length": 988.8333435058594, + "epoch": 0.9885670731707317, + "grad_norm": 0.20214403205271161, + "kl": 0.0615234375, + "learning_rate": 1.1943543510035683e-09, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6485 + }, + { + "completion_length": 841.8333435058594, + "epoch": 0.988719512195122, + "grad_norm": 0.08759677325884421, + "kl": 0.04437255859375, + "learning_rate": 1.1627213191875984e-09, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6486 + }, + { + "completion_length": 2422.0, + "epoch": 0.9888719512195122, + "grad_norm": 1.534011415446628, + "kl": 0.0611572265625, + "learning_rate": 1.1315126738359327e-09, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6487 + }, + { + "completion_length": 1687.5000610351562, + "epoch": 0.9890243902439024, + "grad_norm": 0.08481095282543759, + "kl": 0.050048828125, + "learning_rate": 1.1007284237850025e-09, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6488 + }, + { + "completion_length": 1930.8333740234375, + "epoch": 0.9891768292682926, + "grad_norm": 0.08454845078912872, + "kl": 0.05908203125, + "learning_rate": 1.070368577751335e-09, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6489 + }, + { + "completion_length": 1703.3333740234375, + "epoch": 0.989329268292683, + "grad_norm": 0.12473003521272237, + "kl": 0.058349609375, + "learning_rate": 1.040433144330888e-09, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6490 + }, + { + "completion_length": 1314.166748046875, + "epoch": 0.9894817073170732, + "grad_norm": 0.08680703764375466, + "kl": 0.0421142578125, + "learning_rate": 1.0109221320000473e-09, + "loss": 0.0017, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6491 + }, + { + "completion_length": 1556.666748046875, + "epoch": 0.9896341463414634, + "grad_norm": 0.08490221950794956, + "kl": 0.0533447265625, + "learning_rate": 9.818355491144626e-10, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6492 + }, + { + "completion_length": 1904.3334350585938, + "epoch": 0.9897865853658536, + "grad_norm": 0.06962363976811453, + "kl": 0.044921875, + "learning_rate": 9.531734039098795e-10, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6493 + }, + { + "completion_length": 913.5000305175781, + "epoch": 0.989939024390244, + "grad_norm": 0.0818831936706604, + "kl": 0.0570068359375, + "learning_rate": 9.249357045016393e-10, + "loss": 0.0023, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6494 + }, + { + "completion_length": 2130.3334350585938, + "epoch": 0.9900914634146342, + "grad_norm": 0.07319926477020883, + "kl": 0.0469970703125, + "learning_rate": 8.971224588853466e-10, + "loss": 0.0019, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6495 + }, + { + "completion_length": 1659.3333740234375, + "epoch": 0.9902439024390244, + "grad_norm": 0.22493983780686555, + "kl": 0.081298828125, + "learning_rate": 8.697336749358687e-10, + "loss": 0.0033, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6496 + }, + { + "completion_length": 1897.0000610351562, + "epoch": 0.9903963414634146, + "grad_norm": 0.05652376325752781, + "kl": 0.035888671875, + "learning_rate": 8.427693604085018e-10, + "loss": 0.0014, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6497 + }, + { + "completion_length": 1564.6667175292969, + "epoch": 0.9905487804878049, + "grad_norm": 0.07264686921953815, + "kl": 0.0594482421875, + "learning_rate": 8.162295229376393e-10, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6498 + }, + { + "completion_length": 2307.8333435058594, + "epoch": 0.9907012195121951, + "grad_norm": 0.1325760453311061, + "kl": 0.0704345703125, + "learning_rate": 7.901141700381032e-10, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6499 + }, + { + "completion_length": 1549.5000915527344, + "epoch": 0.9908536585365854, + "grad_norm": 0.08088997447342602, + "kl": 0.061279296875, + "learning_rate": 7.644233091043118e-10, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6500 + }, + { + "completion_length": 1917.5000610351562, + "epoch": 0.9910060975609756, + "grad_norm": 0.05518078506954226, + "kl": 0.052734375, + "learning_rate": 7.391569474104465e-10, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6501 + }, + { + "completion_length": 719.8333740234375, + "epoch": 0.9911585365853659, + "grad_norm": 0.1309024748799736, + "kl": 0.05126953125, + "learning_rate": 7.143150921104514e-10, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6502 + }, + { + "completion_length": 1502.666748046875, + "epoch": 0.9913109756097561, + "grad_norm": 0.4135035076613005, + "kl": 0.066162109375, + "learning_rate": 6.898977502381998e-10, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6503 + }, + { + "completion_length": 1396.666748046875, + "epoch": 0.9914634146341463, + "grad_norm": 0.08051241250653643, + "kl": 0.044189453125, + "learning_rate": 6.659049287071617e-10, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6504 + }, + { + "completion_length": 1089.0000610351562, + "epoch": 0.9916158536585366, + "grad_norm": 0.2111564902984234, + "kl": 0.068115234375, + "learning_rate": 6.423366343110693e-10, + "loss": 0.0027, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6505 + }, + { + "completion_length": 3032.6666870117188, + "epoch": 0.9917682926829269, + "grad_norm": 0.044374072458871296, + "kl": 0.04150390625, + "learning_rate": 6.19192873722918e-10, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6506 + }, + { + "completion_length": 1411.8333740234375, + "epoch": 0.9919207317073171, + "grad_norm": 0.08098371968387784, + "kl": 0.0469970703125, + "learning_rate": 5.964736534956327e-10, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6507 + }, + { + "completion_length": 938.6666870117188, + "epoch": 0.9920731707317073, + "grad_norm": 0.09363907437122716, + "kl": 0.04736328125, + "learning_rate": 5.741789800622344e-10, + "loss": 0.0019, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6508 + }, + { + "completion_length": 764.0000305175781, + "epoch": 0.9922256097560975, + "grad_norm": 2.125610556709717, + "kl": 0.0693359375, + "learning_rate": 5.523088597351733e-10, + "loss": 0.0028, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6509 + }, + { + "completion_length": 1628.3334350585938, + "epoch": 0.9923780487804879, + "grad_norm": 0.09442959708154744, + "kl": 0.0528564453125, + "learning_rate": 5.308632987069961e-10, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6510 + }, + { + "completion_length": 1177.6666870117188, + "epoch": 0.9925304878048781, + "grad_norm": 0.06944734580631635, + "kl": 0.0443115234375, + "learning_rate": 5.09842303049679e-10, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6511 + }, + { + "completion_length": 1855.8334350585938, + "epoch": 0.9926829268292683, + "grad_norm": 0.09301578660023961, + "kl": 0.069091796875, + "learning_rate": 4.892458787154608e-10, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6512 + }, + { + "completion_length": 1371.0000305175781, + "epoch": 0.9928353658536585, + "grad_norm": 0.09980426567793464, + "kl": 0.0574951171875, + "learning_rate": 4.690740315356767e-10, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6513 + }, + { + "completion_length": 1477.5000610351562, + "epoch": 0.9929878048780488, + "grad_norm": 0.07463999656034398, + "kl": 0.04931640625, + "learning_rate": 4.4932676722225784e-10, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6514 + }, + { + "completion_length": 1264.8333435058594, + "epoch": 0.993140243902439, + "grad_norm": 1.6021093674753895, + "kl": 0.0631103515625, + "learning_rate": 4.3000409136623176e-10, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6515 + }, + { + "completion_length": 2348.3333740234375, + "epoch": 0.9932926829268293, + "grad_norm": 0.13598712734542973, + "kl": 0.0574951171875, + "learning_rate": 4.1110600943905507e-10, + "loss": 0.0023, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6516 + }, + { + "completion_length": 2274.1666870117188, + "epoch": 0.9934451219512195, + "grad_norm": 1.3874132891871678, + "kl": 0.0550537109375, + "learning_rate": 3.926325267911146e-10, + "loss": 0.0022, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6517 + }, + { + "completion_length": 1260.1666870117188, + "epoch": 0.9935975609756098, + "grad_norm": 1.4463435136206853, + "kl": 0.062255859375, + "learning_rate": 3.7458364865355923e-10, + "loss": 0.0025, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6518 + }, + { + "completion_length": 1034.0000610351562, + "epoch": 0.99375, + "grad_norm": 0.09299422695090347, + "kl": 0.053466796875, + "learning_rate": 3.569593801363014e-10, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6519 + }, + { + "completion_length": 1470.0000610351562, + "epoch": 0.9939024390243902, + "grad_norm": 2.853462438269486, + "kl": 0.0582275390625, + "learning_rate": 3.397597262300156e-10, + "loss": 0.0023, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6520 + }, + { + "completion_length": 1324.8333435058594, + "epoch": 0.9940548780487805, + "grad_norm": 1.0708959004636862, + "kl": 0.0489501953125, + "learning_rate": 3.229846918044732e-10, + "loss": 0.002, + "reward": 0.3333333432674408, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.3333333432674408, + "rewards/format_reward": 0.0, + "step": 6521 + }, + { + "completion_length": 1836.5, + "epoch": 0.9942073170731708, + "grad_norm": 0.13868224176690636, + "kl": 0.0546875, + "learning_rate": 3.066342816093748e-10, + "loss": 0.0022, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6522 + }, + { + "completion_length": 1129.1667175292969, + "epoch": 0.994359756097561, + "grad_norm": 0.1983682239069798, + "kl": 0.074462890625, + "learning_rate": 2.907085002743504e-10, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6523 + }, + { + "completion_length": 1625.166748046875, + "epoch": 0.9945121951219512, + "grad_norm": 0.08708145866347228, + "kl": 0.0450439453125, + "learning_rate": 2.7520735230845973e-10, + "loss": 0.0018, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6524 + }, + { + "completion_length": 1326.8333740234375, + "epoch": 0.9946646341463414, + "grad_norm": 0.10547245714420804, + "kl": 0.0430908203125, + "learning_rate": 2.6013084210102514e-10, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6525 + }, + { + "completion_length": 1515.1666870117188, + "epoch": 0.9948170731707318, + "grad_norm": 0.11751804084176255, + "kl": 0.073974609375, + "learning_rate": 2.4547897392079855e-10, + "loss": 0.003, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6526 + }, + { + "completion_length": 1786.8333740234375, + "epoch": 0.994969512195122, + "grad_norm": 0.07817620936107873, + "kl": 0.062744140625, + "learning_rate": 2.3125175191629489e-10, + "loss": 0.0025, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6527 + }, + { + "completion_length": 1373.6666870117188, + "epoch": 0.9951219512195122, + "grad_norm": 0.07435762798374973, + "kl": 0.05029296875, + "learning_rate": 2.1744918011595837e-10, + "loss": 0.002, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6528 + }, + { + "completion_length": 961.0000305175781, + "epoch": 0.9952743902439024, + "grad_norm": 0.10054491403435693, + "kl": 0.062744140625, + "learning_rate": 2.040712624278296e-10, + "loss": 0.0025, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6529 + }, + { + "completion_length": 1601.8333740234375, + "epoch": 0.9954268292682927, + "grad_norm": 0.09956348143556945, + "kl": 0.0513916015625, + "learning_rate": 1.9111800263971192e-10, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6530 + }, + { + "completion_length": 1703.8333435058594, + "epoch": 0.995579268292683, + "grad_norm": 0.07209545000244587, + "kl": 0.053955078125, + "learning_rate": 1.7858940441933814e-10, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6531 + }, + { + "completion_length": 560.5000152587891, + "epoch": 0.9957317073170732, + "grad_norm": 0.09982761759127846, + "kl": 0.0692138671875, + "learning_rate": 1.664854713142039e-10, + "loss": 0.0028, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6532 + }, + { + "completion_length": 1122.1666870117188, + "epoch": 0.9958841463414634, + "grad_norm": 0.1295348129083706, + "kl": 0.0599365234375, + "learning_rate": 1.5480620675123458e-10, + "loss": 0.0024, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6533 + }, + { + "completion_length": 1546.8333740234375, + "epoch": 0.9960365853658537, + "grad_norm": 0.09766614084273038, + "kl": 0.067138671875, + "learning_rate": 1.4355161403745153e-10, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6534 + }, + { + "completion_length": 2441.83349609375, + "epoch": 0.9961890243902439, + "grad_norm": 2.777563460578517, + "kl": 0.0714111328125, + "learning_rate": 1.3272169635963893e-10, + "loss": 0.0029, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6535 + }, + { + "completion_length": 1852.166748046875, + "epoch": 0.9963414634146341, + "grad_norm": 0.05531339365272541, + "kl": 0.039794921875, + "learning_rate": 1.2231645678401072e-10, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6536 + }, + { + "completion_length": 1455.3333740234375, + "epoch": 0.9964939024390244, + "grad_norm": 0.06738520154458698, + "kl": 0.0396728515625, + "learning_rate": 1.1233589825687673e-10, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6537 + }, + { + "completion_length": 697.0, + "epoch": 0.9966463414634147, + "grad_norm": 1.7874210664798458, + "kl": 0.0589599609375, + "learning_rate": 1.0278002360414319e-10, + "loss": 0.0024, + "reward": 0.1666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.1666666716337204, + "rewards/format_reward": 0.0, + "step": 6538 + }, + { + "completion_length": 1006.6666870117188, + "epoch": 0.9967987804878049, + "grad_norm": 0.0996360719871746, + "kl": 0.0513916015625, + "learning_rate": 9.364883553147907e-11, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6539 + }, + { + "completion_length": 3154.166748046875, + "epoch": 0.9969512195121951, + "grad_norm": 0.04897888840483542, + "kl": 0.04541015625, + "learning_rate": 8.494233662431627e-11, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6540 + }, + { + "completion_length": 1160.6666870117188, + "epoch": 0.9971036585365853, + "grad_norm": 0.3041452503220598, + "kl": 0.0533447265625, + "learning_rate": 7.66605293478495e-11, + "loss": 0.0021, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6541 + }, + { + "completion_length": 2270.3333740234375, + "epoch": 0.9972560975609757, + "grad_norm": 0.06783149861338193, + "kl": 0.04217529296875, + "learning_rate": 6.880341604720286e-11, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6542 + }, + { + "completion_length": 780.6666870117188, + "epoch": 0.9974085365853659, + "grad_norm": 0.16121781713474828, + "kl": 0.041748046875, + "learning_rate": 6.137099894676368e-11, + "loss": 0.0017, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6543 + }, + { + "completion_length": 1375.5000305175781, + "epoch": 0.9975609756097561, + "grad_norm": 0.09415720215837747, + "kl": 0.0650634765625, + "learning_rate": 5.436328015101522e-11, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6544 + }, + { + "completion_length": 1947.0000915527344, + "epoch": 0.9977134146341463, + "grad_norm": 0.07504468541314853, + "kl": 0.040771484375, + "learning_rate": 4.77802616443701e-11, + "loss": 0.0016, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6545 + }, + { + "completion_length": 999.0000305175781, + "epoch": 0.9978658536585366, + "grad_norm": 0.09019279882728487, + "kl": 0.0537109375, + "learning_rate": 4.162194529067076e-11, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6546 + }, + { + "completion_length": 782.6666870117188, + "epoch": 0.9980182926829269, + "grad_norm": 0.14719260029432343, + "kl": 0.082763671875, + "learning_rate": 3.588833283352244e-11, + "loss": 0.0033, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6547 + }, + { + "completion_length": 2132.3333435058594, + "epoch": 0.9981707317073171, + "grad_norm": 0.1282055065449003, + "kl": 0.044921875, + "learning_rate": 3.057942589645979e-11, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6548 + }, + { + "completion_length": 1017.1666870117188, + "epoch": 0.9983231707317073, + "grad_norm": 0.09098239970410599, + "kl": 0.0516357421875, + "learning_rate": 2.5695225982613758e-11, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6549 + }, + { + "completion_length": 1959.8333435058594, + "epoch": 0.9984756097560976, + "grad_norm": 0.07093051277365835, + "kl": 0.040283203125, + "learning_rate": 2.1235734474878143e-11, + "loss": 0.0016, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6550 + }, + { + "completion_length": 1574.0, + "epoch": 0.9986280487804878, + "grad_norm": 0.09089089724330746, + "kl": 0.052978515625, + "learning_rate": 1.720095263607613e-11, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6551 + }, + { + "completion_length": 1925.5000915527344, + "epoch": 0.998780487804878, + "grad_norm": 0.1203612527615984, + "kl": 0.05615234375, + "learning_rate": 1.359088160846067e-11, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6552 + }, + { + "completion_length": 1076.5000610351562, + "epoch": 0.9989329268292683, + "grad_norm": 0.08826063282884802, + "kl": 0.0518798828125, + "learning_rate": 1.0405522414380641e-11, + "loss": 0.0021, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6553 + }, + { + "completion_length": 1966.3334350585938, + "epoch": 0.9990853658536586, + "grad_norm": 0.07414102217560124, + "kl": 0.0555419921875, + "learning_rate": 7.644875955448161e-12, + "loss": 0.0022, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6554 + }, + { + "completion_length": 1606.6666870117188, + "epoch": 0.9992378048780488, + "grad_norm": 0.90586816414353, + "kl": 0.060546875, + "learning_rate": 5.308943013704326e-12, + "loss": 0.0024, + "reward": 0.6666666716337204, + "reward_std": 0.28867512941360474, + "rewards/accuracy_reward": 0.6666666716337204, + "rewards/format_reward": 0.0, + "step": 6555 + }, + { + "completion_length": 1759.5001220703125, + "epoch": 0.999390243902439, + "grad_norm": 0.12061231925902986, + "kl": 0.0673828125, + "learning_rate": 3.3977242502869487e-12, + "loss": 0.0027, + "reward": 0.5, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.5, + "rewards/format_reward": 0.0, + "step": 6556 + }, + { + "completion_length": 1145.1666870117188, + "epoch": 0.9995426829268292, + "grad_norm": 0.13136576270740774, + "kl": 0.094482421875, + "learning_rate": 1.9112202064297534e-12, + "loss": 0.0038, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6557 + }, + { + "completion_length": 1748.1666870117188, + "epoch": 0.9996951219512196, + "grad_norm": 0.10324142336744951, + "kl": 0.06591796875, + "learning_rate": 8.494313031293111e-13, + "loss": 0.0026, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6558 + }, + { + "completion_length": 1919.1666870117188, + "epoch": 0.9998475609756098, + "grad_norm": 0.07887244458582071, + "kl": 0.04541015625, + "learning_rate": 2.1235784081197196e-13, + "loss": 0.0018, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6559 + }, + { + "completion_length": 717.5, + "epoch": 1.0, + "grad_norm": 0.19901242754733192, + "kl": 0.082763671875, + "learning_rate": 0.0, + "loss": 0.0028, + "reward": 0.0, + "reward_std": 0.0, + "rewards/accuracy_reward": 0.0, + "rewards/format_reward": 0.0, + "step": 6560 + }, + { + "epoch": 1.0, + "step": 6560, + "total_flos": 0.0, + "train_loss": 0.002648238443211542, + "train_runtime": 433761.625, + "train_samples_per_second": 0.03, + "train_steps_per_second": 0.015 + } + ], + "logging_steps": 1, + "max_steps": 6560, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}