{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 820, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 788.5000305175781, "epoch": 0.0012195121951219512, "grad_norm": 0.3571978509426117, "kl": 0.0, "learning_rate": 3.658536585365854e-08, "loss": 0.0178, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 1 }, { "completion_length": 595.1875305175781, "epoch": 0.0024390243902439024, "grad_norm": 0.3311821520328522, "kl": 0.0, "learning_rate": 7.317073170731708e-08, "loss": -0.006, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 2 }, { "completion_length": 894.6041870117188, "epoch": 0.003658536585365854, "grad_norm": 0.5522251129150391, "kl": 0.00023651123046875, "learning_rate": 1.097560975609756e-07, "loss": -0.0317, "reward": 0.1875000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 3 }, { "completion_length": 913.2083435058594, "epoch": 0.004878048780487805, "grad_norm": 0.2455306351184845, "kl": 0.00029087066650390625, "learning_rate": 1.4634146341463415e-07, "loss": -0.0142, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 4 }, { "completion_length": 645.5625, "epoch": 0.006097560975609756, "grad_norm": 0.20754282176494598, "kl": 0.0003032684326171875, "learning_rate": 1.8292682926829268e-07, "loss": -0.0035, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 5 }, { "completion_length": 825.25, "epoch": 0.007317073170731708, "grad_norm": 0.33052483201026917, "kl": 0.000278472900390625, "learning_rate": 2.195121951219512e-07, "loss": 0.0511, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 6 }, { "completion_length": 646.5208435058594, "epoch": 0.00853658536585366, "grad_norm": 0.6244280934333801, "kl": 0.00029754638671875, "learning_rate": 2.5609756097560976e-07, "loss": -0.0356, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 7 }, { "completion_length": 723.1041870117188, "epoch": 0.00975609756097561, "grad_norm": 0.3806585371494293, "kl": 0.00030422210693359375, "learning_rate": 2.926829268292683e-07, "loss": 0.0408, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 8 }, { "completion_length": 752.7083435058594, "epoch": 0.01097560975609756, "grad_norm": 0.3775721490383148, "kl": 0.00028228759765625, "learning_rate": 3.2926829268292686e-07, "loss": 0.0091, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 9 }, { "completion_length": 880.8541870117188, "epoch": 0.012195121951219513, "grad_norm": 0.16199147701263428, "kl": 0.00026607513427734375, "learning_rate": 3.6585365853658536e-07, "loss": 0.0043, "reward": 0.1666666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 10 }, { "completion_length": 752.1458435058594, "epoch": 0.013414634146341463, "grad_norm": 0.5467624068260193, "kl": 0.000339508056640625, "learning_rate": 4.0243902439024396e-07, "loss": -0.056, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 11 }, { "completion_length": 790.125, "epoch": 0.014634146341463415, "grad_norm": 0.3221971392631531, "kl": 0.0002956390380859375, "learning_rate": 4.390243902439024e-07, "loss": -0.0217, "reward": 0.2500000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 12 }, { "completion_length": 661.7291870117188, "epoch": 0.015853658536585366, "grad_norm": 0.5072605609893799, "kl": 0.00029659271240234375, "learning_rate": 4.75609756097561e-07, "loss": -0.0177, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 13 }, { "completion_length": 620.8750305175781, "epoch": 0.01707317073170732, "grad_norm": 0.32282891869544983, "kl": 0.00041866302490234375, "learning_rate": 5.121951219512195e-07, "loss": 0.0156, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 14 }, { "completion_length": 821.2291870117188, "epoch": 0.018292682926829267, "grad_norm": 0.2993911802768707, "kl": 0.00032138824462890625, "learning_rate": 5.48780487804878e-07, "loss": 0.0265, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 15 }, { "completion_length": 632.0416870117188, "epoch": 0.01951219512195122, "grad_norm": 0.1648959368467331, "kl": 0.000415802001953125, "learning_rate": 5.853658536585366e-07, "loss": -0.0027, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 16 }, { "completion_length": 606.5208435058594, "epoch": 0.020731707317073172, "grad_norm": 0.4805357754230499, "kl": 0.0004749298095703125, "learning_rate": 6.219512195121951e-07, "loss": -0.0221, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 17 }, { "completion_length": 603.0625305175781, "epoch": 0.02195121951219512, "grad_norm": 0.09931223839521408, "kl": 0.0008544921875, "learning_rate": 6.585365853658537e-07, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 18 }, { "completion_length": 764.8125305175781, "epoch": 0.023170731707317073, "grad_norm": 0.02471252717077732, "kl": 0.0004911422729492188, "learning_rate": 6.951219512195122e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 19 }, { "completion_length": 608.6875305175781, "epoch": 0.024390243902439025, "grad_norm": 0.4140380918979645, "kl": 0.0008373260498046875, "learning_rate": 7.317073170731707e-07, "loss": -0.0022, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 20 }, { "completion_length": 810.7292175292969, "epoch": 0.025609756097560974, "grad_norm": 0.43750235438346863, "kl": 0.001323699951171875, "learning_rate": 7.682926829268293e-07, "loss": -0.0126, "reward": 0.229166679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 21 }, { "completion_length": 733.875, "epoch": 0.026829268292682926, "grad_norm": 0.0427839532494545, "kl": 0.0009918212890625, "learning_rate": 8.048780487804879e-07, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 22 }, { "completion_length": 719.0208435058594, "epoch": 0.02804878048780488, "grad_norm": 0.3788954019546509, "kl": 0.005901336669921875, "learning_rate": 8.414634146341464e-07, "loss": -0.0154, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 23 }, { "completion_length": 668.7708435058594, "epoch": 0.02926829268292683, "grad_norm": 0.6176497936248779, "kl": 0.00179290771484375, "learning_rate": 8.780487804878048e-07, "loss": 0.0085, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 24 }, { "completion_length": 859.4166870117188, "epoch": 0.03048780487804878, "grad_norm": 0.34154024720191956, "kl": 0.0024566650390625, "learning_rate": 9.146341463414634e-07, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 25 }, { "completion_length": 600.7708435058594, "epoch": 0.03170731707317073, "grad_norm": 0.6522072553634644, "kl": 0.005767822265625, "learning_rate": 9.51219512195122e-07, "loss": -0.0238, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 26 }, { "completion_length": 773.3541870117188, "epoch": 0.032926829268292684, "grad_norm": 0.07703638821840286, "kl": 0.00222015380859375, "learning_rate": 9.878048780487806e-07, "loss": 0.0001, "reward": 0.3125, "reward_std": 0.0, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 27 }, { "completion_length": 944.4583435058594, "epoch": 0.03414634146341464, "grad_norm": 0.37970709800720215, "kl": 0.001628875732421875, "learning_rate": 1.024390243902439e-06, "loss": -0.0156, "reward": 0.291666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 28 }, { "completion_length": 838.8333435058594, "epoch": 0.03536585365853658, "grad_norm": 0.06090879812836647, "kl": 0.001903533935546875, "learning_rate": 1.0609756097560976e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 29 }, { "completion_length": 771.0833435058594, "epoch": 0.036585365853658534, "grad_norm": 0.3602464199066162, "kl": 0.00479888916015625, "learning_rate": 1.097560975609756e-06, "loss": 0.0039, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 30 }, { "completion_length": 664.7500305175781, "epoch": 0.03780487804878049, "grad_norm": 0.4147832691669464, "kl": 0.0021514892578125, "learning_rate": 1.1341463414634146e-06, "loss": 0.0062, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 31 }, { "completion_length": 860.0208740234375, "epoch": 0.03902439024390244, "grad_norm": 0.03816133737564087, "kl": 0.00128936767578125, "learning_rate": 1.1707317073170732e-06, "loss": 0.0, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 32 }, { "completion_length": 725.3125, "epoch": 0.04024390243902439, "grad_norm": 0.0702565535902977, "kl": 0.002208709716796875, "learning_rate": 1.2073170731707318e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 33 }, { "completion_length": 651.3958435058594, "epoch": 0.041463414634146344, "grad_norm": 0.04627303034067154, "kl": 0.001392364501953125, "learning_rate": 1.2439024390243902e-06, "loss": 0.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 34 }, { "completion_length": 865.2291870117188, "epoch": 0.042682926829268296, "grad_norm": 0.35550418496131897, "kl": 0.002017974853515625, "learning_rate": 1.2804878048780488e-06, "loss": -0.0023, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 35 }, { "completion_length": 950.8333435058594, "epoch": 0.04390243902439024, "grad_norm": 0.3738349974155426, "kl": 0.001232147216796875, "learning_rate": 1.3170731707317074e-06, "loss": -0.0004, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 36 }, { "completion_length": 663.4375305175781, "epoch": 0.045121951219512194, "grad_norm": 0.5210116505622864, "kl": 0.002288818359375, "learning_rate": 1.3536585365853658e-06, "loss": 0.0308, "reward": 0.229166679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 37 }, { "completion_length": 690.5, "epoch": 0.046341463414634146, "grad_norm": 0.39043503999710083, "kl": 0.009716033935546875, "learning_rate": 1.3902439024390244e-06, "loss": 0.0009, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 38 }, { "completion_length": 679.2916870117188, "epoch": 0.0475609756097561, "grad_norm": 0.20443572103977203, "kl": 0.00389862060546875, "learning_rate": 1.4268292682926828e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 39 }, { "completion_length": 637.7708740234375, "epoch": 0.04878048780487805, "grad_norm": 0.3915785551071167, "kl": 0.001926422119140625, "learning_rate": 1.4634146341463414e-06, "loss": 0.0156, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 40 }, { "completion_length": 742.4166870117188, "epoch": 0.05, "grad_norm": 0.4709751009941101, "kl": 0.002841949462890625, "learning_rate": 1.5e-06, "loss": 0.0118, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 41 }, { "completion_length": 794.6666870117188, "epoch": 0.05121951219512195, "grad_norm": 0.09043405950069427, "kl": 0.001739501953125, "learning_rate": 1.5365853658536586e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 42 }, { "completion_length": 653.7708740234375, "epoch": 0.0524390243902439, "grad_norm": 0.4868049919605255, "kl": 0.001434326171875, "learning_rate": 1.5731707317073172e-06, "loss": 0.0249, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 43 }, { "completion_length": 551.6458435058594, "epoch": 0.05365853658536585, "grad_norm": 0.3079073429107666, "kl": 0.00128173828125, "learning_rate": 1.6097560975609759e-06, "loss": 0.0083, "reward": 0.2916666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 44 }, { "completion_length": 794.8333435058594, "epoch": 0.054878048780487805, "grad_norm": 0.5330808758735657, "kl": 0.000972747802734375, "learning_rate": 1.6463414634146342e-06, "loss": -0.0414, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 45 }, { "completion_length": 604.5833435058594, "epoch": 0.05609756097560976, "grad_norm": 0.5505648851394653, "kl": 0.00283050537109375, "learning_rate": 1.6829268292682928e-06, "loss": -0.0118, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 46 }, { "completion_length": 541.2291717529297, "epoch": 0.05731707317073171, "grad_norm": 0.6158074736595154, "kl": 0.00244903564453125, "learning_rate": 1.719512195121951e-06, "loss": 0.0012, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 47 }, { "completion_length": 671.0416870117188, "epoch": 0.05853658536585366, "grad_norm": 0.1726604402065277, "kl": 0.005340576171875, "learning_rate": 1.7560975609756096e-06, "loss": 0.0001, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 48 }, { "completion_length": 757.9166870117188, "epoch": 0.05975609756097561, "grad_norm": 0.08531015366315842, "kl": 0.002162933349609375, "learning_rate": 1.7926829268292682e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 49 }, { "completion_length": 729.4583740234375, "epoch": 0.06097560975609756, "grad_norm": 0.42457133531570435, "kl": 0.0019073486328125, "learning_rate": 1.8292682926829268e-06, "loss": -0.033, "reward": 0.1458333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 50 }, { "completion_length": 641.375, "epoch": 0.06219512195121951, "grad_norm": 0.04091706499457359, "kl": 0.002422332763671875, "learning_rate": 1.8658536585365854e-06, "loss": 0.0001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 51 }, { "completion_length": 760.2708740234375, "epoch": 0.06341463414634146, "grad_norm": 0.3898300528526306, "kl": 0.00279998779296875, "learning_rate": 1.902439024390244e-06, "loss": -0.0003, "reward": 0.2500000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 52 }, { "completion_length": 744.9375305175781, "epoch": 0.06463414634146342, "grad_norm": 0.09664002805948257, "kl": 0.0029296875, "learning_rate": 1.9390243902439024e-06, "loss": 0.0018, "reward": 0.3333333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 53 }, { "completion_length": 771.5625, "epoch": 0.06585365853658537, "grad_norm": 0.644061803817749, "kl": 0.00276947021484375, "learning_rate": 1.9756097560975613e-06, "loss": -0.0595, "reward": 0.2500000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 54 }, { "completion_length": 714.1041870117188, "epoch": 0.06707317073170732, "grad_norm": 0.4097103178501129, "kl": 0.00342559814453125, "learning_rate": 2.0121951219512197e-06, "loss": 0.0054, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 55 }, { "completion_length": 667.5625305175781, "epoch": 0.06829268292682927, "grad_norm": 0.690647304058075, "kl": 0.0050201416015625, "learning_rate": 2.048780487804878e-06, "loss": 0.0823, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 56 }, { "completion_length": 631.9375, "epoch": 0.06951219512195123, "grad_norm": 0.21440348029136658, "kl": 0.0043792724609375, "learning_rate": 2.0853658536585364e-06, "loss": 0.0048, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 57 }, { "completion_length": 617.8958435058594, "epoch": 0.07073170731707316, "grad_norm": 0.363092303276062, "kl": 0.005340576171875, "learning_rate": 2.1219512195121953e-06, "loss": 0.0056, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 58 }, { "completion_length": 794.4375, "epoch": 0.07195121951219512, "grad_norm": 0.46107953786849976, "kl": 0.008514404296875, "learning_rate": 2.1585365853658537e-06, "loss": 0.012, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 59 }, { "completion_length": 769.2083740234375, "epoch": 0.07317073170731707, "grad_norm": 0.6187219023704529, "kl": 0.0089111328125, "learning_rate": 2.195121951219512e-06, "loss": 0.0335, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 60 }, { "completion_length": 766.9791870117188, "epoch": 0.07439024390243902, "grad_norm": 0.5316298007965088, "kl": 0.005126953125, "learning_rate": 2.231707317073171e-06, "loss": -0.0355, "reward": 0.2083333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 61 }, { "completion_length": 622.1041870117188, "epoch": 0.07560975609756097, "grad_norm": 0.6351970434188843, "kl": 0.0066375732421875, "learning_rate": 2.2682926829268293e-06, "loss": 0.0001, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 62 }, { "completion_length": 640.4791870117188, "epoch": 0.07682926829268293, "grad_norm": 0.4834135174751282, "kl": 0.0060577392578125, "learning_rate": 2.304878048780488e-06, "loss": -0.0195, "reward": 0.3125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 63 }, { "completion_length": 700.7708435058594, "epoch": 0.07804878048780488, "grad_norm": 0.34847137331962585, "kl": 0.00543975830078125, "learning_rate": 2.3414634146341465e-06, "loss": 0.0004, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 64 }, { "completion_length": 704.4583435058594, "epoch": 0.07926829268292683, "grad_norm": 0.5386676788330078, "kl": 0.005706787109375, "learning_rate": 2.378048780487805e-06, "loss": -0.0559, "reward": 0.1458333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 65 }, { "completion_length": 825.2083740234375, "epoch": 0.08048780487804878, "grad_norm": 0.6664050817489624, "kl": 0.0067596435546875, "learning_rate": 2.4146341463414637e-06, "loss": 0.0535, "reward": 0.1458333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 66 }, { "completion_length": 743.7916717529297, "epoch": 0.08170731707317073, "grad_norm": 0.6769405603408813, "kl": 0.0077056884765625, "learning_rate": 2.451219512195122e-06, "loss": 0.0357, "reward": 0.2083333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 67 }, { "completion_length": 681.4166870117188, "epoch": 0.08292682926829269, "grad_norm": 0.11391153931617737, "kl": 0.005401611328125, "learning_rate": 2.4878048780487805e-06, "loss": 0.0002, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 68 }, { "completion_length": 906.8125305175781, "epoch": 0.08414634146341464, "grad_norm": 0.48767825961112976, "kl": 0.01153564453125, "learning_rate": 2.524390243902439e-06, "loss": -0.0238, "reward": 0.4166666865348816, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 69 }, { "completion_length": 617.2916717529297, "epoch": 0.08536585365853659, "grad_norm": 0.8901994228363037, "kl": 0.015533447265625, "learning_rate": 2.5609756097560977e-06, "loss": -0.0081, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 70 }, { "completion_length": 775.3541870117188, "epoch": 0.08658536585365853, "grad_norm": 0.5658962726593018, "kl": 0.014251708984375, "learning_rate": 2.597560975609756e-06, "loss": 0.0227, "reward": 0.3125, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 71 }, { "completion_length": 717.9375305175781, "epoch": 0.08780487804878048, "grad_norm": 0.5440481305122375, "kl": 0.009857177734375, "learning_rate": 2.634146341463415e-06, "loss": -0.0534, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 72 }, { "completion_length": 718.6458740234375, "epoch": 0.08902439024390243, "grad_norm": 0.6130486130714417, "kl": 0.012054443359375, "learning_rate": 2.6707317073170733e-06, "loss": 0.0151, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 73 }, { "completion_length": 813.5625305175781, "epoch": 0.09024390243902439, "grad_norm": 0.11815643310546875, "kl": 0.0111541748046875, "learning_rate": 2.7073170731707317e-06, "loss": 0.0004, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 74 }, { "completion_length": 825.8541870117188, "epoch": 0.09146341463414634, "grad_norm": 395.2074279785156, "kl": 4.0714111328125, "learning_rate": 2.7439024390243905e-06, "loss": 0.1267, "reward": 0.2500000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 75 }, { "completion_length": 799.9583435058594, "epoch": 0.09268292682926829, "grad_norm": 0.4025568664073944, "kl": 0.013458251953125, "learning_rate": 2.780487804878049e-06, "loss": -0.0142, "reward": 0.3333333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 76 }, { "completion_length": 971.3333435058594, "epoch": 0.09390243902439024, "grad_norm": 0.5087530016899109, "kl": 0.016357421875, "learning_rate": 2.8170731707317073e-06, "loss": -0.0119, "reward": 0.10416666977107525, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 77 }, { "completion_length": 587.6041870117188, "epoch": 0.0951219512195122, "grad_norm": 0.38013386726379395, "kl": 0.01312255859375, "learning_rate": 2.8536585365853657e-06, "loss": 0.0125, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 78 }, { "completion_length": 659.6041870117188, "epoch": 0.09634146341463415, "grad_norm": 0.058323778212070465, "kl": 0.013397216796875, "learning_rate": 2.8902439024390245e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 79 }, { "completion_length": 680.1041870117188, "epoch": 0.0975609756097561, "grad_norm": 0.39666855335235596, "kl": 0.01202392578125, "learning_rate": 2.926829268292683e-06, "loss": 0.0016, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 80 }, { "completion_length": 898.7708740234375, "epoch": 0.09878048780487805, "grad_norm": 0.5175566673278809, "kl": 0.01434326171875, "learning_rate": 2.9634146341463417e-06, "loss": 0.018, "reward": 0.14583333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 81 }, { "completion_length": 584.2083435058594, "epoch": 0.1, "grad_norm": 0.25543463230133057, "kl": 0.01416015625, "learning_rate": 3e-06, "loss": 0.0109, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 82 }, { "completion_length": 652.0416870117188, "epoch": 0.10121951219512196, "grad_norm": 0.5867159962654114, "kl": 0.0198974609375, "learning_rate": 2.9999864091183917e-06, "loss": -0.0393, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 83 }, { "completion_length": 657.2708740234375, "epoch": 0.1024390243902439, "grad_norm": 0.5001187324523926, "kl": 0.014007568359375, "learning_rate": 2.999945636719849e-06, "loss": 0.0109, "reward": 0.1875000111758709, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 84 }, { "completion_length": 672.2916870117188, "epoch": 0.10365853658536585, "grad_norm": 0.2978525757789612, "kl": 0.014129638671875, "learning_rate": 2.999877683543216e-06, "loss": -0.017, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 85 }, { "completion_length": 654.8958435058594, "epoch": 0.1048780487804878, "grad_norm": 0.4366808533668518, "kl": 0.009307861328125, "learning_rate": 2.999782550819884e-06, "loss": -0.0144, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 86 }, { "completion_length": 725.7083435058594, "epoch": 0.10609756097560975, "grad_norm": 0.332344651222229, "kl": 0.012420654296875, "learning_rate": 2.99966024027377e-06, "loss": 0.0065, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 87 }, { "completion_length": 814.4583435058594, "epoch": 0.1073170731707317, "grad_norm": 0.4384961724281311, "kl": 0.010833740234375, "learning_rate": 2.9995107541212846e-06, "loss": -0.0281, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 88 }, { "completion_length": 704.3125305175781, "epoch": 0.10853658536585366, "grad_norm": 0.32473617792129517, "kl": 0.011474609375, "learning_rate": 2.999334095071293e-06, "loss": 0.0134, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 89 }, { "completion_length": 691.3958435058594, "epoch": 0.10975609756097561, "grad_norm": 0.4972739517688751, "kl": 0.012054443359375, "learning_rate": 2.9991302663250642e-06, "loss": 0.0078, "reward": 0.2500000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 90 }, { "completion_length": 602.7916870117188, "epoch": 0.11097560975609756, "grad_norm": 0.06607077270746231, "kl": 0.01300048828125, "learning_rate": 2.9988992715762147e-06, "loss": 0.0005, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 91 }, { "completion_length": 793.6666870117188, "epoch": 0.11219512195121951, "grad_norm": 0.38537999987602234, "kl": 0.013641357421875, "learning_rate": 2.9986411150106423e-06, "loss": 0.021, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 92 }, { "completion_length": 774.5000305175781, "epoch": 0.11341463414634147, "grad_norm": 0.3016974925994873, "kl": 0.013336181640625, "learning_rate": 2.9983558013064455e-06, "loss": -0.0093, "reward": 0.2708333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 93 }, { "completion_length": 645.1875305175781, "epoch": 0.11463414634146342, "grad_norm": 0.5931347012519836, "kl": 0.01019287109375, "learning_rate": 2.998043335633845e-06, "loss": 0.0087, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 94 }, { "completion_length": 724.2916870117188, "epoch": 0.11585365853658537, "grad_norm": 0.2517394721508026, "kl": 0.015899658203125, "learning_rate": 2.997703723655086e-06, "loss": 0.0087, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 95 }, { "completion_length": 671.7083740234375, "epoch": 0.11707317073170732, "grad_norm": 0.12199469655752182, "kl": 0.014068603515625, "learning_rate": 2.9973369715243363e-06, "loss": 0.0005, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 96 }, { "completion_length": 665.0833435058594, "epoch": 0.11829268292682926, "grad_norm": 0.4756318926811218, "kl": 0.0115966796875, "learning_rate": 2.996943085887577e-06, "loss": -0.003, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 97 }, { "completion_length": 662.1250305175781, "epoch": 0.11951219512195121, "grad_norm": 0.3721674680709839, "kl": 0.01690673828125, "learning_rate": 2.996522073882477e-06, "loss": -0.0076, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 98 }, { "completion_length": 706.9791870117188, "epoch": 0.12073170731707317, "grad_norm": 0.4329390525817871, "kl": 0.011962890625, "learning_rate": 2.9960739431382697e-06, "loss": -0.0022, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 99 }, { "completion_length": 675.2708435058594, "epoch": 0.12195121951219512, "grad_norm": 0.08323477953672409, "kl": 0.01898193359375, "learning_rate": 2.9955987017756107e-06, "loss": 0.0007, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 100 }, { "completion_length": 774.4166870117188, "epoch": 0.12317073170731707, "grad_norm": 0.3017697334289551, "kl": 0.014556884765625, "learning_rate": 2.9950963584064327e-06, "loss": -0.0116, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 101 }, { "completion_length": 690.4791870117188, "epoch": 0.12439024390243902, "grad_norm": 10.445072174072266, "kl": 0.10589599609375, "learning_rate": 2.9945669221337873e-06, "loss": -0.023, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 102 }, { "completion_length": 690.6666870117188, "epoch": 0.12560975609756098, "grad_norm": 0.24977770447731018, "kl": 0.016754150390625, "learning_rate": 2.994010402551682e-06, "loss": 0.0083, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 103 }, { "completion_length": 768.1458740234375, "epoch": 0.12682926829268293, "grad_norm": 0.5401036739349365, "kl": 0.0157470703125, "learning_rate": 2.9934268097449068e-06, "loss": -0.0023, "reward": 0.2083333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 104 }, { "completion_length": 716.8958435058594, "epoch": 0.12804878048780488, "grad_norm": 0.3317832350730896, "kl": 0.01983642578125, "learning_rate": 2.9928161542888487e-06, "loss": 0.0046, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 105 }, { "completion_length": 739.6458435058594, "epoch": 0.12926829268292683, "grad_norm": 0.2833709120750427, "kl": 0.0157470703125, "learning_rate": 2.9921784472493023e-06, "loss": 0.0306, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 106 }, { "completion_length": 685.7083435058594, "epoch": 0.13048780487804879, "grad_norm": 0.49674850702285767, "kl": 0.02630615234375, "learning_rate": 2.9915137001822686e-06, "loss": -0.0083, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 107 }, { "completion_length": 962.9166870117188, "epoch": 0.13170731707317074, "grad_norm": 0.23559360206127167, "kl": 0.01373291015625, "learning_rate": 2.9908219251337465e-06, "loss": 0.0306, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 108 }, { "completion_length": 756.2500305175781, "epoch": 0.1329268292682927, "grad_norm": 0.49854040145874023, "kl": 0.01385498046875, "learning_rate": 2.9901031346395125e-06, "loss": -0.0227, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 109 }, { "completion_length": 880.1458740234375, "epoch": 0.13414634146341464, "grad_norm": 0.35081374645233154, "kl": 0.01531982421875, "learning_rate": 2.9893573417248957e-06, "loss": 0.0152, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 110 }, { "completion_length": 783.8541870117188, "epoch": 0.1353658536585366, "grad_norm": 0.32845669984817505, "kl": 0.0166015625, "learning_rate": 2.98858455990454e-06, "loss": 0.0296, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 111 }, { "completion_length": 924.6666870117188, "epoch": 0.13658536585365855, "grad_norm": 0.8032549619674683, "kl": 0.05633544921875, "learning_rate": 2.987784803182161e-06, "loss": 0.0036, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 112 }, { "completion_length": 793.3958435058594, "epoch": 0.1378048780487805, "grad_norm": 0.03511551022529602, "kl": 0.013946533203125, "learning_rate": 2.9869580860502894e-06, "loss": 0.0005, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 113 }, { "completion_length": 724.6666870117188, "epoch": 0.13902439024390245, "grad_norm": 0.04026523232460022, "kl": 0.013092041015625, "learning_rate": 2.9861044234900125e-06, "loss": 0.0005, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 114 }, { "completion_length": 1096.5625610351562, "epoch": 0.1402439024390244, "grad_norm": 0.046590324491262436, "kl": 0.014556884765625, "learning_rate": 2.985223830970699e-06, "loss": 0.0005, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 115 }, { "completion_length": 756.1458740234375, "epoch": 0.14146341463414633, "grad_norm": 1.1626088619232178, "kl": 0.079345703125, "learning_rate": 2.98431632444972e-06, "loss": -0.0112, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 116 }, { "completion_length": 537.8125, "epoch": 0.14268292682926828, "grad_norm": 0.26080095767974854, "kl": 0.019287109375, "learning_rate": 2.9833819203721614e-06, "loss": 0.0128, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 117 }, { "completion_length": 636.1041870117188, "epoch": 0.14390243902439023, "grad_norm": 0.4711505174636841, "kl": 0.0189208984375, "learning_rate": 2.982420635670523e-06, "loss": 0.0116, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 118 }, { "completion_length": 633.5625305175781, "epoch": 0.14512195121951219, "grad_norm": 0.05762294679880142, "kl": 0.015533447265625, "learning_rate": 2.981432487764413e-06, "loss": 0.0006, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 119 }, { "completion_length": 568.2291870117188, "epoch": 0.14634146341463414, "grad_norm": 0.5497531890869141, "kl": 0.01641845703125, "learning_rate": 2.980417494560234e-06, "loss": 0.0081, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 120 }, { "completion_length": 732.0416870117188, "epoch": 0.1475609756097561, "grad_norm": 0.5708346366882324, "kl": 0.015960693359375, "learning_rate": 2.979375674450855e-06, "loss": -0.0526, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 121 }, { "completion_length": 672.0208435058594, "epoch": 0.14878048780487804, "grad_norm": 0.21933433413505554, "kl": 0.01849365234375, "learning_rate": 2.9783070463152816e-06, "loss": 0.008, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 122 }, { "completion_length": 673.6875, "epoch": 0.15, "grad_norm": 0.5358403325080872, "kl": 0.02154541015625, "learning_rate": 2.9772116295183124e-06, "loss": -0.0399, "reward": 0.1875000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 123 }, { "completion_length": 678.8333740234375, "epoch": 0.15121951219512195, "grad_norm": 0.45023179054260254, "kl": 0.022705078125, "learning_rate": 2.9760894439101857e-06, "loss": 0.0313, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 124 }, { "completion_length": 702.5000305175781, "epoch": 0.1524390243902439, "grad_norm": 0.20217838883399963, "kl": 0.013641357421875, "learning_rate": 2.974940509826225e-06, "loss": 0.0027, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 125 }, { "completion_length": 633.3541870117188, "epoch": 0.15365853658536585, "grad_norm": 0.37631967663764954, "kl": 0.02264404296875, "learning_rate": 2.973764848086466e-06, "loss": -0.0185, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 126 }, { "completion_length": 554.8750152587891, "epoch": 0.1548780487804878, "grad_norm": 0.3253299593925476, "kl": 0.02276611328125, "learning_rate": 2.9725624799952824e-06, "loss": -0.0038, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 127 }, { "completion_length": 584.6458587646484, "epoch": 0.15609756097560976, "grad_norm": 0.39743635058403015, "kl": 0.02117919921875, "learning_rate": 2.9713334273409965e-06, "loss": 0.0128, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 128 }, { "completion_length": 737.2083435058594, "epoch": 0.1573170731707317, "grad_norm": 0.37635689973831177, "kl": 0.01800537109375, "learning_rate": 2.9700777123954867e-06, "loss": -0.0073, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 129 }, { "completion_length": 751.875, "epoch": 0.15853658536585366, "grad_norm": 0.7148156762123108, "kl": 0.0213623046875, "learning_rate": 2.968795357913784e-06, "loss": 0.0008, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 130 }, { "completion_length": 941.3125305175781, "epoch": 0.1597560975609756, "grad_norm": 0.10271207243204117, "kl": 0.019287109375, "learning_rate": 2.9674863871336603e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 131 }, { "completion_length": 761.4375, "epoch": 0.16097560975609757, "grad_norm": 0.21008461713790894, "kl": 0.0179443359375, "learning_rate": 2.9661508237752034e-06, "loss": 0.0088, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 132 }, { "completion_length": 679.3541870117188, "epoch": 0.16219512195121952, "grad_norm": 0.3089422881603241, "kl": 0.0194091796875, "learning_rate": 2.9647886920403916e-06, "loss": 0.024, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 133 }, { "completion_length": 667.9166870117188, "epoch": 0.16341463414634147, "grad_norm": 0.07238946855068207, "kl": 0.015869140625, "learning_rate": 2.9634000166126534e-06, "loss": 0.0006, "reward": 0.375, "reward_std": 0.0, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 134 }, { "completion_length": 620.2291870117188, "epoch": 0.16463414634146342, "grad_norm": 0.44060084223747253, "kl": 0.02508544921875, "learning_rate": 2.9619848226564196e-06, "loss": -0.0035, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 135 }, { "completion_length": 505.79168701171875, "epoch": 0.16585365853658537, "grad_norm": 0.60687255859375, "kl": 0.0185546875, "learning_rate": 2.9605431358166687e-06, "loss": -0.0126, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 136 }, { "completion_length": 592.6875305175781, "epoch": 0.16707317073170733, "grad_norm": 0.7305315136909485, "kl": 0.0240478515625, "learning_rate": 2.9590749822184602e-06, "loss": -0.0122, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 137 }, { "completion_length": 604.3750152587891, "epoch": 0.16829268292682928, "grad_norm": 0.5852400660514832, "kl": 0.0186767578125, "learning_rate": 2.9575803884664634e-06, "loss": 0.0194, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 138 }, { "completion_length": 675.6666870117188, "epoch": 0.16951219512195123, "grad_norm": 0.2013079971075058, "kl": 0.02276611328125, "learning_rate": 2.9560593816444746e-06, "loss": 0.0004, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 139 }, { "completion_length": 504.37501525878906, "epoch": 0.17073170731707318, "grad_norm": 0.4904243052005768, "kl": 0.0238037109375, "learning_rate": 2.9545119893149243e-06, "loss": -0.0117, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 140 }, { "completion_length": 681.2083435058594, "epoch": 0.1719512195121951, "grad_norm": 0.6175960302352905, "kl": 0.024658203125, "learning_rate": 2.9529382395183812e-06, "loss": -0.0032, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 141 }, { "completion_length": 726.3125, "epoch": 0.17317073170731706, "grad_norm": 0.07112989574670792, "kl": 0.01910400390625, "learning_rate": 2.9513381607730403e-06, "loss": 0.0007, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 142 }, { "completion_length": 650.2083740234375, "epoch": 0.174390243902439, "grad_norm": 0.37218180298805237, "kl": 0.0191650390625, "learning_rate": 2.949711782074211e-06, "loss": 0.0127, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 143 }, { "completion_length": 814.2708435058594, "epoch": 0.17560975609756097, "grad_norm": 0.05150744691491127, "kl": 0.01824951171875, "learning_rate": 2.948059132893786e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 144 }, { "completion_length": 687.2291870117188, "epoch": 0.17682926829268292, "grad_norm": 0.2909289300441742, "kl": 0.02252197265625, "learning_rate": 2.9463802431797115e-06, "loss": 0.0009, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 145 }, { "completion_length": 607.1458740234375, "epoch": 0.17804878048780487, "grad_norm": 0.468717485666275, "kl": 0.0185546875, "learning_rate": 2.9446751433554426e-06, "loss": 0.0035, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 146 }, { "completion_length": 590.0833435058594, "epoch": 0.17926829268292682, "grad_norm": 0.2879053056240082, "kl": 0.01934814453125, "learning_rate": 2.942943864319392e-06, "loss": -0.0179, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 147 }, { "completion_length": 552.7083435058594, "epoch": 0.18048780487804877, "grad_norm": 0.11373342573642731, "kl": 0.019775390625, "learning_rate": 2.941186437444372e-06, "loss": 0.0008, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 148 }, { "completion_length": 507.62501525878906, "epoch": 0.18170731707317073, "grad_norm": 0.4177855849266052, "kl": 0.02447509765625, "learning_rate": 2.939402894577022e-06, "loss": 0.0069, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 149 }, { "completion_length": 425.4166717529297, "epoch": 0.18292682926829268, "grad_norm": 0.3714848458766937, "kl": 0.02252197265625, "learning_rate": 2.9375932680372358e-06, "loss": -0.0108, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 150 }, { "completion_length": 693.5000305175781, "epoch": 0.18414634146341463, "grad_norm": 0.3068605363368988, "kl": 0.0174560546875, "learning_rate": 2.935757590617574e-06, "loss": 0.0115, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 151 }, { "completion_length": 863.25, "epoch": 0.18536585365853658, "grad_norm": 0.07176525145769119, "kl": 0.014923095703125, "learning_rate": 2.9338958955826685e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 152 }, { "completion_length": 561.2916870117188, "epoch": 0.18658536585365854, "grad_norm": 0.051739297807216644, "kl": 0.0205078125, "learning_rate": 2.9320082166686226e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 153 }, { "completion_length": 621.0, "epoch": 0.1878048780487805, "grad_norm": 0.26465901732444763, "kl": 0.0167236328125, "learning_rate": 2.9300945880823955e-06, "loss": -0.0025, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 154 }, { "completion_length": 525.875, "epoch": 0.18902439024390244, "grad_norm": 0.6293399930000305, "kl": 0.0224609375, "learning_rate": 2.928155044501189e-06, "loss": -0.0075, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 155 }, { "completion_length": 561.7916870117188, "epoch": 0.1902439024390244, "grad_norm": 0.41370439529418945, "kl": 0.016876220703125, "learning_rate": 2.9261896210718106e-06, "loss": 0.0014, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 156 }, { "completion_length": 748.8750305175781, "epoch": 0.19146341463414634, "grad_norm": 0.04764688387513161, "kl": 0.0169677734375, "learning_rate": 2.924198353410044e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 157 }, { "completion_length": 576.6666717529297, "epoch": 0.1926829268292683, "grad_norm": 0.5345750451087952, "kl": 0.02020263671875, "learning_rate": 2.9221812776000003e-06, "loss": 0.0161, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 158 }, { "completion_length": 717.7291870117188, "epoch": 0.19390243902439025, "grad_norm": 0.2683437168598175, "kl": 0.016448974609375, "learning_rate": 2.9201384301934632e-06, "loss": -0.0001, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 159 }, { "completion_length": 744.0833435058594, "epoch": 0.1951219512195122, "grad_norm": 0.05052180215716362, "kl": 0.0198974609375, "learning_rate": 2.9180698482092302e-06, "loss": 0.0007, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 160 }, { "completion_length": 791.9166870117188, "epoch": 0.19634146341463415, "grad_norm": 0.04119595140218735, "kl": 0.016937255859375, "learning_rate": 2.9159755691324377e-06, "loss": 0.0006, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 161 }, { "completion_length": 639.9375305175781, "epoch": 0.1975609756097561, "grad_norm": 0.37889334559440613, "kl": 0.020751953125, "learning_rate": 2.913855630913884e-06, "loss": -0.0038, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 162 }, { "completion_length": 868.8125, "epoch": 0.19878048780487806, "grad_norm": 0.27075132727622986, "kl": 0.01708984375, "learning_rate": 2.911710071969342e-06, "loss": 0.0158, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 163 }, { "completion_length": 890.9791870117188, "epoch": 0.2, "grad_norm": 0.285118043422699, "kl": 0.014984130859375, "learning_rate": 2.9095389311788626e-06, "loss": -0.0051, "reward": 0.2291666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 164 }, { "completion_length": 745.7916870117188, "epoch": 0.20121951219512196, "grad_norm": 0.4438501000404358, "kl": 0.016265869140625, "learning_rate": 2.9073422478860678e-06, "loss": -0.0643, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 165 }, { "completion_length": 664.1666870117188, "epoch": 0.20243902439024392, "grad_norm": 0.26312437653541565, "kl": 0.02099609375, "learning_rate": 2.9051200618974418e-06, "loss": 0.0026, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 166 }, { "completion_length": 731.6875, "epoch": 0.20365853658536584, "grad_norm": 0.34627678990364075, "kl": 0.01849365234375, "learning_rate": 2.9028724134816064e-06, "loss": -0.0197, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 167 }, { "completion_length": 688.1041870117188, "epoch": 0.2048780487804878, "grad_norm": 0.4049510657787323, "kl": 0.017578125, "learning_rate": 2.9005993433685932e-06, "loss": 0.013, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 168 }, { "completion_length": 683.3750305175781, "epoch": 0.20609756097560974, "grad_norm": 0.5772159099578857, "kl": 0.01739501953125, "learning_rate": 2.8983008927491046e-06, "loss": 0.0063, "reward": 0.2708333432674408, "reward_std": 0.10825318098068237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 169 }, { "completion_length": 693.4583740234375, "epoch": 0.2073170731707317, "grad_norm": 0.3611339032649994, "kl": 0.0157470703125, "learning_rate": 2.8959771032737673e-06, "loss": -0.0406, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 170 }, { "completion_length": 630.0000305175781, "epoch": 0.20853658536585365, "grad_norm": 0.33315309882164, "kl": 0.01806640625, "learning_rate": 2.8936280170523784e-06, "loss": -0.016, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 171 }, { "completion_length": 736.4166870117188, "epoch": 0.2097560975609756, "grad_norm": 0.5158050060272217, "kl": 0.0198974609375, "learning_rate": 2.8912536766531423e-06, "loss": -0.0491, "reward": 0.125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 172 }, { "completion_length": 731.9166870117188, "epoch": 0.21097560975609755, "grad_norm": 0.22972215712070465, "kl": 0.017333984375, "learning_rate": 2.8888541251018963e-06, "loss": -0.0104, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 173 }, { "completion_length": 803.3333435058594, "epoch": 0.2121951219512195, "grad_norm": 0.7990434169769287, "kl": 0.02093505859375, "learning_rate": 2.8864294058813364e-06, "loss": -0.112, "reward": 0.2500000149011612, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 174 }, { "completion_length": 572.9583740234375, "epoch": 0.21341463414634146, "grad_norm": 0.4772682189941406, "kl": 0.02081298828125, "learning_rate": 2.883979562930225e-06, "loss": -0.0108, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 175 }, { "completion_length": 669.0208435058594, "epoch": 0.2146341463414634, "grad_norm": 0.05744696035981178, "kl": 0.02032470703125, "learning_rate": 2.8815046406425954e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 176 }, { "completion_length": 815.25, "epoch": 0.21585365853658536, "grad_norm": 0.2521149516105652, "kl": 0.01373291015625, "learning_rate": 2.8790046838669493e-06, "loss": 0.0314, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 177 }, { "completion_length": 688.3750305175781, "epoch": 0.21707317073170732, "grad_norm": 0.6815643906593323, "kl": 0.02545166015625, "learning_rate": 2.876479737905442e-06, "loss": -0.0403, "reward": 0.1041666679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 178 }, { "completion_length": 618.25, "epoch": 0.21829268292682927, "grad_norm": 0.5136005878448486, "kl": 0.02203369140625, "learning_rate": 2.8739298485130627e-06, "loss": -0.0078, "reward": 0.2083333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 179 }, { "completion_length": 671.2500305175781, "epoch": 0.21951219512195122, "grad_norm": 0.4481271803379059, "kl": 0.0186767578125, "learning_rate": 2.8713550618968034e-06, "loss": 0.0089, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 180 }, { "completion_length": 777.4791870117188, "epoch": 0.22073170731707317, "grad_norm": 0.3541518449783325, "kl": 0.02325439453125, "learning_rate": 2.8687554247148247e-06, "loss": 0.0262, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 181 }, { "completion_length": 725.6875305175781, "epoch": 0.22195121951219512, "grad_norm": 0.09448660165071487, "kl": 0.0201416015625, "learning_rate": 2.8661309840756093e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 182 }, { "completion_length": 852.5833435058594, "epoch": 0.22317073170731708, "grad_norm": 0.30420514941215515, "kl": 0.018310546875, "learning_rate": 2.863481787537105e-06, "loss": 0.005, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 183 }, { "completion_length": 676.0833435058594, "epoch": 0.22439024390243903, "grad_norm": 0.7469632029533386, "kl": 0.0177001953125, "learning_rate": 2.8608078831058682e-06, "loss": 0.015, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 184 }, { "completion_length": 737.5833740234375, "epoch": 0.22560975609756098, "grad_norm": 0.4696647822856903, "kl": 0.0260009765625, "learning_rate": 2.8581093192361895e-06, "loss": 0.0463, "reward": 0.2291666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 185 }, { "completion_length": 859.0, "epoch": 0.22682926829268293, "grad_norm": 0.04222070053219795, "kl": 0.0218505859375, "learning_rate": 2.8553861448292185e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 186 }, { "completion_length": 733.7708435058594, "epoch": 0.2280487804878049, "grad_norm": 0.4025222659111023, "kl": 0.0279541015625, "learning_rate": 2.852638409232077e-06, "loss": 0.0001, "reward": 0.1041666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 187 }, { "completion_length": 564.2916717529297, "epoch": 0.22926829268292684, "grad_norm": 0.32440370321273804, "kl": 0.0225830078125, "learning_rate": 2.8498661622369637e-06, "loss": 0.0085, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 188 }, { "completion_length": 830.2500305175781, "epoch": 0.2304878048780488, "grad_norm": 0.2527843117713928, "kl": 0.0234375, "learning_rate": 2.8470694540802527e-06, "loss": 0.0077, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 189 }, { "completion_length": 990.375, "epoch": 0.23170731707317074, "grad_norm": 0.5628884434700012, "kl": 0.04888916015625, "learning_rate": 2.8442483354415836e-06, "loss": 0.0041, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 190 }, { "completion_length": 761.8125, "epoch": 0.2329268292682927, "grad_norm": 0.4500414729118347, "kl": 0.02325439453125, "learning_rate": 2.841402857442942e-06, "loss": -0.0141, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 191 }, { "completion_length": 928.3958740234375, "epoch": 0.23414634146341465, "grad_norm": 0.32092925906181335, "kl": 0.0218505859375, "learning_rate": 2.8385330716477335e-06, "loss": 0.0019, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 192 }, { "completion_length": 524.9375305175781, "epoch": 0.23536585365853657, "grad_norm": 0.08342643827199936, "kl": 0.128875732421875, "learning_rate": 2.835639030059851e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 193 }, { "completion_length": 728.5833435058594, "epoch": 0.23658536585365852, "grad_norm": 0.43060678243637085, "kl": 0.0198974609375, "learning_rate": 2.8327207851227295e-06, "loss": 0.0183, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 194 }, { "completion_length": 804.6666870117188, "epoch": 0.23780487804878048, "grad_norm": 0.2729571461677551, "kl": 0.02264404296875, "learning_rate": 2.829778389718398e-06, "loss": 0.0081, "reward": 0.2708333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 195 }, { "completion_length": 701.6666870117188, "epoch": 0.23902439024390243, "grad_norm": 0.46106624603271484, "kl": 0.02728271484375, "learning_rate": 2.826811897166519e-06, "loss": -0.0018, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 196 }, { "completion_length": 764.1666870117188, "epoch": 0.24024390243902438, "grad_norm": 0.1949763149023056, "kl": 0.0191650390625, "learning_rate": 2.8238213612234255e-06, "loss": -0.0161, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 197 }, { "completion_length": 691.0625305175781, "epoch": 0.24146341463414633, "grad_norm": 0.05811993405222893, "kl": 0.0220947265625, "learning_rate": 2.8208068360811445e-06, "loss": 0.0008, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 198 }, { "completion_length": 981.8125, "epoch": 0.2426829268292683, "grad_norm": 0.40761807560920715, "kl": 0.02154541015625, "learning_rate": 2.8177683763664137e-06, "loss": -0.0305, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 199 }, { "completion_length": 962.7500305175781, "epoch": 0.24390243902439024, "grad_norm": 0.26158013939857483, "kl": 0.0181884765625, "learning_rate": 2.8147060371396953e-06, "loss": 0.034, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 200 }, { "completion_length": 662.2083435058594, "epoch": 0.2451219512195122, "grad_norm": 0.4212491512298584, "kl": 0.022216796875, "learning_rate": 2.8116198738941766e-06, "loss": -0.0079, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 201 }, { "completion_length": 836.3541870117188, "epoch": 0.24634146341463414, "grad_norm": 0.3516237437725067, "kl": 0.02349853515625, "learning_rate": 2.8085099425547627e-06, "loss": -0.004, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 202 }, { "completion_length": 744.4583435058594, "epoch": 0.2475609756097561, "grad_norm": 0.5145571827888489, "kl": 0.02130126953125, "learning_rate": 2.8053762994770646e-06, "loss": -0.0356, "reward": 0.16666667722165585, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 203 }, { "completion_length": 899.5416870117188, "epoch": 0.24878048780487805, "grad_norm": 1.0642642974853516, "kl": 0.0482177734375, "learning_rate": 2.8022190014463794e-06, "loss": 0.0028, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 204 }, { "completion_length": 656.0625, "epoch": 0.25, "grad_norm": 0.4658428132534027, "kl": 0.0225830078125, "learning_rate": 2.7990381056766585e-06, "loss": 0.0129, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 205 }, { "completion_length": 529.6041870117188, "epoch": 0.25121951219512195, "grad_norm": 0.6593291163444519, "kl": 0.02252197265625, "learning_rate": 2.795833669809471e-06, "loss": 0.0031, "reward": 0.2708333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 206 }, { "completion_length": 883.5833740234375, "epoch": 0.2524390243902439, "grad_norm": 0.4920080900192261, "kl": 0.02020263671875, "learning_rate": 2.7926057519129634e-06, "loss": 0.0473, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 207 }, { "completion_length": 704.2083435058594, "epoch": 0.25365853658536586, "grad_norm": 0.3727148771286011, "kl": 0.01800537109375, "learning_rate": 2.7893544104808017e-06, "loss": -0.0068, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 208 }, { "completion_length": 555.1458587646484, "epoch": 0.2548780487804878, "grad_norm": 0.6752776503562927, "kl": 0.023681640625, "learning_rate": 2.7860797044311143e-06, "loss": 0.0138, "reward": 0.27083333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.27083333395421505, "rewards/format_reward": 0.0, "step": 209 }, { "completion_length": 644.2708740234375, "epoch": 0.25609756097560976, "grad_norm": 0.35868731141090393, "kl": 0.020751953125, "learning_rate": 2.7827816931054245e-06, "loss": -0.0067, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 210 }, { "completion_length": 760.9583740234375, "epoch": 0.2573170731707317, "grad_norm": 0.4104251265525818, "kl": 0.0220947265625, "learning_rate": 2.7794604362675733e-06, "loss": -0.0301, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 211 }, { "completion_length": 791.0625, "epoch": 0.25853658536585367, "grad_norm": 0.51336669921875, "kl": 0.02239990234375, "learning_rate": 2.7761159941026403e-06, "loss": 0.0342, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 212 }, { "completion_length": 577.2916870117188, "epoch": 0.2597560975609756, "grad_norm": 0.2733917534351349, "kl": 0.024169921875, "learning_rate": 2.772748427215848e-06, "loss": 0.0023, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 213 }, { "completion_length": 569.5000152587891, "epoch": 0.26097560975609757, "grad_norm": 0.63326495885849, "kl": 0.02398681640625, "learning_rate": 2.7693577966314664e-06, "loss": -0.0395, "reward": 0.2291666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 214 }, { "completion_length": 731.6875305175781, "epoch": 0.2621951219512195, "grad_norm": 0.5346475839614868, "kl": 0.0211181640625, "learning_rate": 2.7659441637917076e-06, "loss": 0.0211, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 215 }, { "completion_length": 870.7500610351562, "epoch": 0.2634146341463415, "grad_norm": 0.5171618461608887, "kl": 0.0230712890625, "learning_rate": 2.7625075905556117e-06, "loss": 0.0235, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 216 }, { "completion_length": 658.0, "epoch": 0.2646341463414634, "grad_norm": 0.2690303325653076, "kl": 0.03155517578125, "learning_rate": 2.7590481391979253e-06, "loss": 0.0162, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 217 }, { "completion_length": 802.6666870117188, "epoch": 0.2658536585365854, "grad_norm": 0.3439900875091553, "kl": 0.02734375, "learning_rate": 2.755565872407973e-06, "loss": 0.004, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 218 }, { "completion_length": 749.9375, "epoch": 0.26707317073170733, "grad_norm": 0.5028407573699951, "kl": 0.02398681640625, "learning_rate": 2.7520608532885228e-06, "loss": -0.0342, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 219 }, { "completion_length": 770.5, "epoch": 0.2682926829268293, "grad_norm": 0.1258758008480072, "kl": 0.0250244140625, "learning_rate": 2.7485331453546407e-06, "loss": 0.0009, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 220 }, { "completion_length": 976.0208740234375, "epoch": 0.26951219512195124, "grad_norm": 0.22799670696258545, "kl": 0.02197265625, "learning_rate": 2.744982812532542e-06, "loss": 0.0478, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 221 }, { "completion_length": 659.6458435058594, "epoch": 0.2707317073170732, "grad_norm": 0.35914433002471924, "kl": 0.0286865234375, "learning_rate": 2.7414099191584305e-06, "loss": -0.0094, "reward": 0.4166666865348816, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.4166666865348816, "rewards/format_reward": 0.0, "step": 222 }, { "completion_length": 730.8125, "epoch": 0.27195121951219514, "grad_norm": 0.4243104159832001, "kl": 0.02203369140625, "learning_rate": 2.7378145299773337e-06, "loss": 0.0084, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 223 }, { "completion_length": 587.3125305175781, "epoch": 0.2731707317073171, "grad_norm": 0.4017314016819, "kl": 0.02728271484375, "learning_rate": 2.7341967101419303e-06, "loss": 0.0112, "reward": 0.2500000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 224 }, { "completion_length": 661.5625305175781, "epoch": 0.27439024390243905, "grad_norm": 0.3526459038257599, "kl": 0.02374267578125, "learning_rate": 2.730556525211368e-06, "loss": -0.0158, "reward": 0.1875000111758709, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 225 }, { "completion_length": 649.7083435058594, "epoch": 0.275609756097561, "grad_norm": 0.12818405032157898, "kl": 0.0234375, "learning_rate": 2.726894041150077e-06, "loss": 0.001, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 226 }, { "completion_length": 956.9791870117188, "epoch": 0.27682926829268295, "grad_norm": 2.025303363800049, "kl": 0.09014892578125, "learning_rate": 2.7232093243265727e-06, "loss": 0.0229, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 227 }, { "completion_length": 624.3125305175781, "epoch": 0.2780487804878049, "grad_norm": 0.2305293083190918, "kl": 0.02728271484375, "learning_rate": 2.7195024415122565e-06, "loss": 0.0024, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 228 }, { "completion_length": 624.2916870117188, "epoch": 0.27926829268292686, "grad_norm": 0.5692446231842041, "kl": 0.02569580078125, "learning_rate": 2.715773459880202e-06, "loss": -0.0621, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 229 }, { "completion_length": 821.6875, "epoch": 0.2804878048780488, "grad_norm": 0.7158800959587097, "kl": 0.02783203125, "learning_rate": 2.7120224470039394e-06, "loss": -0.0085, "reward": 0.1458333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 230 }, { "completion_length": 608.7916870117188, "epoch": 0.2817073170731707, "grad_norm": 0.5108224153518677, "kl": 0.02252197265625, "learning_rate": 2.7082494708562316e-06, "loss": 0.0071, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 231 }, { "completion_length": 690.5833435058594, "epoch": 0.28292682926829266, "grad_norm": 0.3444475531578064, "kl": 0.0269775390625, "learning_rate": 2.7044545998078414e-06, "loss": 0.0132, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 232 }, { "completion_length": 780.3333740234375, "epoch": 0.2841463414634146, "grad_norm": 0.1466454416513443, "kl": 0.0250244140625, "learning_rate": 2.7006379026262924e-06, "loss": 0.0002, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 233 }, { "completion_length": 595.2083740234375, "epoch": 0.28536585365853656, "grad_norm": 0.051908962428569794, "kl": 0.08935546875, "learning_rate": 2.696799448474625e-06, "loss": 0.001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 234 }, { "completion_length": 857.4166870117188, "epoch": 0.2865853658536585, "grad_norm": 0.3672059178352356, "kl": 0.030029296875, "learning_rate": 2.69293930691014e-06, "loss": 0.0098, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 235 }, { "completion_length": 766.4791870117188, "epoch": 0.28780487804878047, "grad_norm": 0.13399188220500946, "kl": 0.02374267578125, "learning_rate": 2.689057547883139e-06, "loss": 0.0001, "reward": 0.20833333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 236 }, { "completion_length": 666.7500305175781, "epoch": 0.2890243902439024, "grad_norm": 0.5673995614051819, "kl": 0.02789306640625, "learning_rate": 2.6851542417356605e-06, "loss": -0.0061, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 237 }, { "completion_length": 721.4375305175781, "epoch": 0.29024390243902437, "grad_norm": 0.31549733877182007, "kl": 0.02764892578125, "learning_rate": 2.6812294592001984e-06, "loss": -0.0241, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 238 }, { "completion_length": 702.8333435058594, "epoch": 0.2914634146341463, "grad_norm": 0.4895757734775543, "kl": 0.0269775390625, "learning_rate": 2.677283271398427e-06, "loss": 0.0011, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 239 }, { "completion_length": 734.9791870117188, "epoch": 0.2926829268292683, "grad_norm": 0.28384703397750854, "kl": 0.03662109375, "learning_rate": 2.673315749839907e-06, "loss": -0.0144, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 240 }, { "completion_length": 763.6666870117188, "epoch": 0.2939024390243902, "grad_norm": 0.4405684173107147, "kl": 0.027099609375, "learning_rate": 2.669326966420793e-06, "loss": 0.0024, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 241 }, { "completion_length": 674.2708435058594, "epoch": 0.2951219512195122, "grad_norm": 0.6043628454208374, "kl": 0.03167724609375, "learning_rate": 2.6653169934225295e-06, "loss": -0.0699, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 242 }, { "completion_length": 645.2708435058594, "epoch": 0.29634146341463413, "grad_norm": 0.5713904500007629, "kl": 0.02581787109375, "learning_rate": 2.661285903510541e-06, "loss": -0.0441, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 243 }, { "completion_length": 781.5000305175781, "epoch": 0.2975609756097561, "grad_norm": 0.6145543456077576, "kl": 0.022705078125, "learning_rate": 2.6572337697329145e-06, "loss": -0.008, "reward": 0.2500000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 244 }, { "completion_length": 862.4375305175781, "epoch": 0.29878048780487804, "grad_norm": 0.36308160424232483, "kl": 0.02362060546875, "learning_rate": 2.6531606655190777e-06, "loss": 0.0404, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 245 }, { "completion_length": 779.2500305175781, "epoch": 0.3, "grad_norm": 0.5605431795120239, "kl": 0.0257568359375, "learning_rate": 2.649066664678467e-06, "loss": 0.0311, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 246 }, { "completion_length": 854.9375305175781, "epoch": 0.30121951219512194, "grad_norm": 0.4492291212081909, "kl": 0.02496337890625, "learning_rate": 2.64495184139919e-06, "loss": 0.0258, "reward": 0.125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 247 }, { "completion_length": 865.6250610351562, "epoch": 0.3024390243902439, "grad_norm": 0.71879643201828, "kl": 0.02789306640625, "learning_rate": 2.640816270246681e-06, "loss": 0.0375, "reward": 0.1458333395421505, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 248 }, { "completion_length": 1087.0000305175781, "epoch": 0.30365853658536585, "grad_norm": 0.2714973986148834, "kl": 0.02093505859375, "learning_rate": 2.636660026162351e-06, "loss": -0.0026, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 249 }, { "completion_length": 963.3750305175781, "epoch": 0.3048780487804878, "grad_norm": 0.5230698585510254, "kl": 0.0335693359375, "learning_rate": 2.6324831844622278e-06, "loss": 0.0096, "reward": 0.1458333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 250 }, { "completion_length": 855.3541870117188, "epoch": 0.30609756097560975, "grad_norm": 0.34206509590148926, "kl": 0.0244140625, "learning_rate": 2.628285820835593e-06, "loss": 0.0278, "reward": 0.27083333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.27083333395421505, "rewards/format_reward": 0.0, "step": 251 }, { "completion_length": 850.0417175292969, "epoch": 0.3073170731707317, "grad_norm": 0.35433900356292725, "kl": 0.025390625, "learning_rate": 2.6240680113436096e-06, "loss": -0.0119, "reward": 0.0833333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 252 }, { "completion_length": 1090.8750610351562, "epoch": 0.30853658536585366, "grad_norm": 0.2618762254714966, "kl": 0.02801513671875, "learning_rate": 2.619829832417944e-06, "loss": 0.0758, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 253 }, { "completion_length": 679.8541870117188, "epoch": 0.3097560975609756, "grad_norm": 0.6165598034858704, "kl": 0.02508544921875, "learning_rate": 2.6155713608593796e-06, "loss": 0.0086, "reward": 0.3125000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 254 }, { "completion_length": 809.3125305175781, "epoch": 0.31097560975609756, "grad_norm": 0.2922210991382599, "kl": 0.02557373046875, "learning_rate": 2.6112926738364267e-06, "loss": 0.036, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 255 }, { "completion_length": 780.2291870117188, "epoch": 0.3121951219512195, "grad_norm": 0.4339911937713623, "kl": 0.03070068359375, "learning_rate": 2.606993848883924e-06, "loss": 0.041, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 256 }, { "completion_length": 943.6458740234375, "epoch": 0.31341463414634146, "grad_norm": 0.19445890188217163, "kl": 0.02685546875, "learning_rate": 2.6026749639016327e-06, "loss": 0.0082, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 257 }, { "completion_length": 870.3750305175781, "epoch": 0.3146341463414634, "grad_norm": 0.36287394165992737, "kl": 0.031494140625, "learning_rate": 2.5983360971528252e-06, "loss": 0.0174, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 258 }, { "completion_length": 649.9375, "epoch": 0.31585365853658537, "grad_norm": 0.5976565480232239, "kl": 0.02911376953125, "learning_rate": 2.5939773272628674e-06, "loss": 0.0043, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 259 }, { "completion_length": 733.9166870117188, "epoch": 0.3170731707317073, "grad_norm": 0.6212018728256226, "kl": 0.02813720703125, "learning_rate": 2.5895987332177935e-06, "loss": -0.0088, "reward": 0.125, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 260 }, { "completion_length": 699.7291870117188, "epoch": 0.3182926829268293, "grad_norm": 0.24755185842514038, "kl": 0.02996826171875, "learning_rate": 2.5852003943628746e-06, "loss": 0.0008, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 261 }, { "completion_length": 832.0625305175781, "epoch": 0.3195121951219512, "grad_norm": 0.28362536430358887, "kl": 0.0272216796875, "learning_rate": 2.5807823904011804e-06, "loss": 0.0071, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 262 }, { "completion_length": 752.9375, "epoch": 0.3207317073170732, "grad_norm": 0.6556203365325928, "kl": 0.0247802734375, "learning_rate": 2.576344801392137e-06, "loss": -0.006, "reward": 0.2500000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 263 }, { "completion_length": 812.4791870117188, "epoch": 0.32195121951219513, "grad_norm": 0.5754515528678894, "kl": 0.02923583984375, "learning_rate": 2.571887707750072e-06, "loss": -0.0423, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 264 }, { "completion_length": 936.2708435058594, "epoch": 0.3231707317073171, "grad_norm": 0.26100462675094604, "kl": 0.02996826171875, "learning_rate": 2.5674111902427625e-06, "loss": 0.023, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 265 }, { "completion_length": 683.9375305175781, "epoch": 0.32439024390243903, "grad_norm": 0.24268393218517303, "kl": 0.02978515625, "learning_rate": 2.5629153299899673e-06, "loss": -0.0018, "reward": 0.2291666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 266 }, { "completion_length": 757.6458435058594, "epoch": 0.325609756097561, "grad_norm": 0.5983391404151917, "kl": 0.05010986328125, "learning_rate": 2.5584002084619593e-06, "loss": 0.0316, "reward": 0.2916666716337204, "reward_std": 0.21650636196136475, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 267 }, { "completion_length": 1014.1458740234375, "epoch": 0.32682926829268294, "grad_norm": 0.23932863771915436, "kl": 0.028076171875, "learning_rate": 2.5538659074780484e-06, "loss": 0.0211, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 268 }, { "completion_length": 748.7708435058594, "epoch": 0.3280487804878049, "grad_norm": 0.4234470725059509, "kl": 0.03076171875, "learning_rate": 2.549312509205097e-06, "loss": 0.0318, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 269 }, { "completion_length": 779.1041870117188, "epoch": 0.32926829268292684, "grad_norm": 0.5329450964927673, "kl": 0.03021240234375, "learning_rate": 2.5447400961560355e-06, "loss": -0.0543, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 270 }, { "completion_length": 728.125, "epoch": 0.3304878048780488, "grad_norm": 0.5748668313026428, "kl": 0.0338134765625, "learning_rate": 2.5401487511883627e-06, "loss": -0.0385, "reward": 0.1875, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 271 }, { "completion_length": 635.1666870117188, "epoch": 0.33170731707317075, "grad_norm": 0.7328594326972961, "kl": 0.02838134765625, "learning_rate": 2.5355385575026464e-06, "loss": 0.0339, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 272 }, { "completion_length": 786.9166870117188, "epoch": 0.3329268292682927, "grad_norm": 0.056253425776958466, "kl": 0.0277099609375, "learning_rate": 2.5309095986410155e-06, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 273 }, { "completion_length": 575.5208587646484, "epoch": 0.33414634146341465, "grad_norm": 0.05611734464764595, "kl": 0.0244140625, "learning_rate": 2.5262619584856456e-06, "loss": 0.0009, "reward": 0.3125, "reward_std": 0.0, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 274 }, { "completion_length": 632.1666870117188, "epoch": 0.3353658536585366, "grad_norm": 0.06363707035779953, "kl": 0.0218505859375, "learning_rate": 2.52159572125724e-06, "loss": 0.0008, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 275 }, { "completion_length": 915.6458435058594, "epoch": 0.33658536585365856, "grad_norm": 0.5183939933776855, "kl": 0.027587890625, "learning_rate": 2.5169109715135015e-06, "loss": 0.0111, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 276 }, { "completion_length": 791.625, "epoch": 0.3378048780487805, "grad_norm": 0.32279711961746216, "kl": 0.02484130859375, "learning_rate": 2.512207794147603e-06, "loss": 0.0133, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 277 }, { "completion_length": 712.0000305175781, "epoch": 0.33902439024390246, "grad_norm": 0.18284721672534943, "kl": 0.0255126953125, "learning_rate": 2.507486274386647e-06, "loss": -0.0013, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 278 }, { "completion_length": 719.6041870117188, "epoch": 0.3402439024390244, "grad_norm": 0.3969678580760956, "kl": 0.0311279296875, "learning_rate": 2.5027464977901206e-06, "loss": -0.0471, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 279 }, { "completion_length": 662.8125305175781, "epoch": 0.34146341463414637, "grad_norm": 0.278129518032074, "kl": 0.02740478515625, "learning_rate": 2.4979885502483478e-06, "loss": -0.0116, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 280 }, { "completion_length": 631.0833435058594, "epoch": 0.3426829268292683, "grad_norm": 0.49812057614326477, "kl": 0.0302734375, "learning_rate": 2.4932125179809316e-06, "loss": -0.0037, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 281 }, { "completion_length": 840.5208740234375, "epoch": 0.3439024390243902, "grad_norm": 0.6025025248527527, "kl": 0.03045654296875, "learning_rate": 2.4884184875351897e-06, "loss": 0.0369, "reward": 0.1458333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 282 }, { "completion_length": 601.2500305175781, "epoch": 0.34512195121951217, "grad_norm": 0.2603875696659088, "kl": 0.03564453125, "learning_rate": 2.48360654578459e-06, "loss": 0.0017, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 283 }, { "completion_length": 605.9791870117188, "epoch": 0.3463414634146341, "grad_norm": 0.4111523926258087, "kl": 0.02874755859375, "learning_rate": 2.4787767799271725e-06, "loss": 0.0172, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 284 }, { "completion_length": 581.3125, "epoch": 0.3475609756097561, "grad_norm": 0.3759603798389435, "kl": 0.03076171875, "learning_rate": 2.473929277483972e-06, "loss": -0.0094, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 285 }, { "completion_length": 815.0625305175781, "epoch": 0.348780487804878, "grad_norm": 0.30721497535705566, "kl": 0.0318603515625, "learning_rate": 2.4690641262974317e-06, "loss": 0.0639, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 286 }, { "completion_length": 812.2708435058594, "epoch": 0.35, "grad_norm": 0.05051800608634949, "kl": 0.02484130859375, "learning_rate": 2.464181414529809e-06, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 287 }, { "completion_length": 760.7291870117188, "epoch": 0.35121951219512193, "grad_norm": 0.3336050510406494, "kl": 0.03076171875, "learning_rate": 2.4592812306615812e-06, "loss": -0.0171, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 288 }, { "completion_length": 664.7083740234375, "epoch": 0.3524390243902439, "grad_norm": 0.5336496829986572, "kl": 0.03125, "learning_rate": 2.4543636634898398e-06, "loss": 0.0195, "reward": 0.1041666679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 289 }, { "completion_length": 866.4375, "epoch": 0.35365853658536583, "grad_norm": 0.29412227869033813, "kl": 0.02923583984375, "learning_rate": 2.4494288021266825e-06, "loss": 0.0126, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 290 }, { "completion_length": 922.4791870117188, "epoch": 0.3548780487804878, "grad_norm": 0.62317955493927, "kl": 0.0618896484375, "learning_rate": 2.444476735997598e-06, "loss": 0.0498, "reward": 0.2708333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 291 }, { "completion_length": 728.375, "epoch": 0.35609756097560974, "grad_norm": 0.48821818828582764, "kl": 0.03179931640625, "learning_rate": 2.439507554839846e-06, "loss": -0.0207, "reward": 0.2083333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 292 }, { "completion_length": 655.6458740234375, "epoch": 0.3573170731707317, "grad_norm": 0.3668544888496399, "kl": 0.028076171875, "learning_rate": 2.4345213487008296e-06, "loss": -0.0002, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 293 }, { "completion_length": 563.5625305175781, "epoch": 0.35853658536585364, "grad_norm": 0.2510969340801239, "kl": 0.029296875, "learning_rate": 2.4295182079364655e-06, "loss": 0.0075, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 294 }, { "completion_length": 640.6458740234375, "epoch": 0.3597560975609756, "grad_norm": 0.4731411635875702, "kl": 0.0267333984375, "learning_rate": 2.424498223209545e-06, "loss": 0.0057, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 295 }, { "completion_length": 784.1875305175781, "epoch": 0.36097560975609755, "grad_norm": 0.43168067932128906, "kl": 0.03045654296875, "learning_rate": 2.4194614854880937e-06, "loss": -0.0009, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 296 }, { "completion_length": 666.4791870117188, "epoch": 0.3621951219512195, "grad_norm": 0.41461437940597534, "kl": 0.0250244140625, "learning_rate": 2.4144080860437184e-06, "loss": 0.0125, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 297 }, { "completion_length": 742.7291870117188, "epoch": 0.36341463414634145, "grad_norm": 0.056942686438560486, "kl": 0.026123046875, "learning_rate": 2.409338116449957e-06, "loss": 0.001, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 298 }, { "completion_length": 702.1458435058594, "epoch": 0.3646341463414634, "grad_norm": 0.5765194296836853, "kl": 0.0224609375, "learning_rate": 2.404251668580619e-06, "loss": 0.0231, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 299 }, { "completion_length": 716.8333435058594, "epoch": 0.36585365853658536, "grad_norm": 0.5342187881469727, "kl": 0.02703857421875, "learning_rate": 2.3991488346081183e-06, "loss": -0.0256, "reward": 0.2916666865348816, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 300 }, { "completion_length": 646.7291870117188, "epoch": 0.3670731707317073, "grad_norm": 0.07587277144193649, "kl": 0.0301513671875, "learning_rate": 2.3940297070018048e-06, "loss": 0.0012, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 301 }, { "completion_length": 719.8541870117188, "epoch": 0.36829268292682926, "grad_norm": 0.1976253092288971, "kl": 0.02813720703125, "learning_rate": 2.388894378526288e-06, "loss": 0.0088, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 302 }, { "completion_length": 654.0833740234375, "epoch": 0.3695121951219512, "grad_norm": 0.5830801725387573, "kl": 0.03106689453125, "learning_rate": 2.383742942239757e-06, "loss": 0.02, "reward": 0.1666666716337204, "reward_std": 0.14433757960796356, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 303 }, { "completion_length": 631.0, "epoch": 0.37073170731707317, "grad_norm": 1.7362228631973267, "kl": 0.0513916015625, "learning_rate": 2.3785754914922923e-06, "loss": 0.0032, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 304 }, { "completion_length": 704.5833435058594, "epoch": 0.3719512195121951, "grad_norm": 0.7385122776031494, "kl": 0.0291748046875, "learning_rate": 2.3733921199241755e-06, "loss": -0.0092, "reward": 0.2083333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 305 }, { "completion_length": 819.3750305175781, "epoch": 0.37317073170731707, "grad_norm": 0.3535645008087158, "kl": 0.03369140625, "learning_rate": 2.3681929214641924e-06, "loss": 0.0263, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 306 }, { "completion_length": 774.2083435058594, "epoch": 0.374390243902439, "grad_norm": 0.48355501890182495, "kl": 0.03045654296875, "learning_rate": 2.362977990327931e-06, "loss": -0.0385, "reward": 0.2500000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 307 }, { "completion_length": 787.75, "epoch": 0.375609756097561, "grad_norm": 0.5030492544174194, "kl": 0.02679443359375, "learning_rate": 2.357747421016073e-06, "loss": -0.07, "reward": 0.125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 308 }, { "completion_length": 892.0625305175781, "epoch": 0.37682926829268293, "grad_norm": 0.1832209974527359, "kl": 0.02886962890625, "learning_rate": 2.3525013083126835e-06, "loss": -0.0045, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 309 }, { "completion_length": 713.125, "epoch": 0.3780487804878049, "grad_norm": 0.3876541554927826, "kl": 0.032470703125, "learning_rate": 2.34723974728349e-06, "loss": 0.0125, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 310 }, { "completion_length": 814.3541870117188, "epoch": 0.37926829268292683, "grad_norm": 0.2946406900882721, "kl": 0.03204345703125, "learning_rate": 2.341962833274165e-06, "loss": 0.0051, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 311 }, { "completion_length": 808.9583740234375, "epoch": 0.3804878048780488, "grad_norm": 0.1276874542236328, "kl": 0.03515625, "learning_rate": 2.336670661908592e-06, "loss": 0.0081, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 312 }, { "completion_length": 950.375, "epoch": 0.38170731707317074, "grad_norm": 0.2518679201602936, "kl": 0.02764892578125, "learning_rate": 2.3313633290871373e-06, "loss": -0.0234, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 313 }, { "completion_length": 734.1666870117188, "epoch": 0.3829268292682927, "grad_norm": 0.32292279601097107, "kl": 0.033447265625, "learning_rate": 2.3260409309849103e-06, "loss": -0.0036, "reward": 0.1875, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 314 }, { "completion_length": 892.4166870117188, "epoch": 0.38414634146341464, "grad_norm": 0.2906545400619507, "kl": 0.0340576171875, "learning_rate": 2.3207035640500206e-06, "loss": -0.0361, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 315 }, { "completion_length": 826.3333435058594, "epoch": 0.3853658536585366, "grad_norm": 0.500372052192688, "kl": 0.03460693359375, "learning_rate": 2.315351325001832e-06, "loss": 0.0285, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 316 }, { "completion_length": 968.1666870117188, "epoch": 0.38658536585365855, "grad_norm": 0.15128959715366364, "kl": 0.02838134765625, "learning_rate": 2.3099843108292062e-06, "loss": 0.0349, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 317 }, { "completion_length": 1018.5625, "epoch": 0.3878048780487805, "grad_norm": 0.25746986269950867, "kl": 0.0301513671875, "learning_rate": 2.3046026187887498e-06, "loss": -0.0357, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 318 }, { "completion_length": 822.1041870117188, "epoch": 0.38902439024390245, "grad_norm": 0.2673456072807312, "kl": 0.03369140625, "learning_rate": 2.2992063464030482e-06, "loss": -0.0471, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 319 }, { "completion_length": 761.8958435058594, "epoch": 0.3902439024390244, "grad_norm": 0.10515403747558594, "kl": 0.03021240234375, "learning_rate": 2.293795591458901e-06, "loss": 0.0011, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 320 }, { "completion_length": 594.7291870117188, "epoch": 0.39146341463414636, "grad_norm": 0.32800784707069397, "kl": 0.03155517578125, "learning_rate": 2.288370452005547e-06, "loss": -0.0235, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 321 }, { "completion_length": 684.8541870117188, "epoch": 0.3926829268292683, "grad_norm": 0.0611780546605587, "kl": 0.02734375, "learning_rate": 2.2829310263528907e-06, "loss": 0.001, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 322 }, { "completion_length": 779.6041870117188, "epoch": 0.39390243902439026, "grad_norm": 0.35459983348846436, "kl": 0.02886962890625, "learning_rate": 2.2774774130697184e-06, "loss": 0.0159, "reward": 0.2083333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 323 }, { "completion_length": 792.125, "epoch": 0.3951219512195122, "grad_norm": 0.49110984802246094, "kl": 0.03131103515625, "learning_rate": 2.2720097109819135e-06, "loss": 0.048, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 324 }, { "completion_length": 781.1458435058594, "epoch": 0.39634146341463417, "grad_norm": 0.9487172365188599, "kl": 0.0322265625, "learning_rate": 2.2665280191706656e-06, "loss": 0.0379, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 325 }, { "completion_length": 823.6250305175781, "epoch": 0.3975609756097561, "grad_norm": 0.45459306240081787, "kl": 0.03363037109375, "learning_rate": 2.2610324369706735e-06, "loss": 0.0376, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 326 }, { "completion_length": 528.4791717529297, "epoch": 0.39878048780487807, "grad_norm": 0.35636627674102783, "kl": 0.03955078125, "learning_rate": 2.2555230639683464e-06, "loss": 0.0086, "reward": 0.2708333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 327 }, { "completion_length": 632.2083435058594, "epoch": 0.4, "grad_norm": 0.7059880495071411, "kl": 0.02734375, "learning_rate": 2.25e-06, "loss": -0.0038, "reward": 0.2083333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 328 }, { "completion_length": 597.8541717529297, "epoch": 0.401219512195122, "grad_norm": 0.45517703890800476, "kl": 0.03338623046875, "learning_rate": 2.2444633451500453e-06, "loss": 0.0128, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 329 }, { "completion_length": 750.7083435058594, "epoch": 0.4024390243902439, "grad_norm": 0.07014621794223785, "kl": 0.02850341796875, "learning_rate": 2.2389131997491756e-06, "loss": 0.001, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 330 }, { "completion_length": 736.0625, "epoch": 0.4036585365853659, "grad_norm": 0.37191396951675415, "kl": 0.02838134765625, "learning_rate": 2.2333496643725505e-06, "loss": 0.0431, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 331 }, { "completion_length": 845.6250305175781, "epoch": 0.40487804878048783, "grad_norm": 0.052367597818374634, "kl": 0.0240478515625, "learning_rate": 2.2277728398379705e-06, "loss": 0.0009, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 332 }, { "completion_length": 994.6250610351562, "epoch": 0.4060975609756098, "grad_norm": 0.31657665967941284, "kl": 0.03240966796875, "learning_rate": 2.2221828272040517e-06, "loss": 0.0022, "reward": 0.0833333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 333 }, { "completion_length": 674.9791870117188, "epoch": 0.4073170731707317, "grad_norm": 0.05391751974821091, "kl": 0.02685546875, "learning_rate": 2.2165797277683943e-06, "loss": 0.001, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 334 }, { "completion_length": 672.7291870117188, "epoch": 0.40853658536585363, "grad_norm": 0.2743265628814697, "kl": 0.03350830078125, "learning_rate": 2.2109636430657463e-06, "loss": 0.0015, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 335 }, { "completion_length": 604.9375, "epoch": 0.4097560975609756, "grad_norm": 0.40125370025634766, "kl": 0.03033447265625, "learning_rate": 2.2053346748661633e-06, "loss": 0.0156, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 336 }, { "completion_length": 1189.8125, "epoch": 0.41097560975609754, "grad_norm": 0.13064952194690704, "kl": 0.02838134765625, "learning_rate": 2.1996929251731665e-06, "loss": 0.002, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 337 }, { "completion_length": 603.6875305175781, "epoch": 0.4121951219512195, "grad_norm": 0.7183840274810791, "kl": 0.0283203125, "learning_rate": 2.194038496221892e-06, "loss": 0.0167, "reward": 0.2083333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 338 }, { "completion_length": 777.9375305175781, "epoch": 0.41341463414634144, "grad_norm": 0.042287491261959076, "kl": 0.0262451171875, "learning_rate": 2.188371490477239e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 339 }, { "completion_length": 842.4792175292969, "epoch": 0.4146341463414634, "grad_norm": 0.29391470551490784, "kl": 0.0264892578125, "learning_rate": 2.182692010632013e-06, "loss": 0.0147, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 340 }, { "completion_length": 677.5833435058594, "epoch": 0.41585365853658535, "grad_norm": 0.4390711784362793, "kl": 0.0345458984375, "learning_rate": 2.177000159605065e-06, "loss": 0.0028, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 341 }, { "completion_length": 760.4791870117188, "epoch": 0.4170731707317073, "grad_norm": 0.3324912190437317, "kl": 0.02978515625, "learning_rate": 2.1712960405394265e-06, "loss": -0.0057, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 342 }, { "completion_length": 658.0208435058594, "epoch": 0.41829268292682925, "grad_norm": 0.42109569907188416, "kl": 0.0302734375, "learning_rate": 2.1655797568004397e-06, "loss": 0.0047, "reward": 0.2500000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 343 }, { "completion_length": 844.0208435058594, "epoch": 0.4195121951219512, "grad_norm": 0.5153623819351196, "kl": 0.029296875, "learning_rate": 2.1598514119738853e-06, "loss": 0.0467, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 344 }, { "completion_length": 1024.7292175292969, "epoch": 0.42073170731707316, "grad_norm": 0.32257041335105896, "kl": 0.0279541015625, "learning_rate": 2.154111109864105e-06, "loss": 0.0028, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 345 }, { "completion_length": 634.3541870117188, "epoch": 0.4219512195121951, "grad_norm": 0.43821436166763306, "kl": 0.0289306640625, "learning_rate": 2.1483589544921202e-06, "loss": 0.0102, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 346 }, { "completion_length": 988.8750305175781, "epoch": 0.42317073170731706, "grad_norm": 0.2754349410533905, "kl": 0.02734375, "learning_rate": 2.1425950500937493e-06, "loss": 0.0076, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 347 }, { "completion_length": 675.8541870117188, "epoch": 0.424390243902439, "grad_norm": 1.2542449235916138, "kl": 0.03582763671875, "learning_rate": 2.1368195011177142e-06, "loss": 0.0095, "reward": 0.2291666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 348 }, { "completion_length": 866.3750305175781, "epoch": 0.42560975609756097, "grad_norm": 0.4978950023651123, "kl": 0.03076171875, "learning_rate": 2.1310324122237512e-06, "loss": 0.0125, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 349 }, { "completion_length": 871.4375305175781, "epoch": 0.4268292682926829, "grad_norm": 0.4244663417339325, "kl": 0.02947998046875, "learning_rate": 2.125233888280715e-06, "loss": -0.0582, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 350 }, { "completion_length": 857.8333740234375, "epoch": 0.42804878048780487, "grad_norm": 0.5104000568389893, "kl": 0.02020263671875, "learning_rate": 2.1194240343646732e-06, "loss": -0.0086, "reward": 0.2500000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 351 }, { "completion_length": 779.0625305175781, "epoch": 0.4292682926829268, "grad_norm": 0.2433476448059082, "kl": 0.03094482421875, "learning_rate": 2.11360295575701e-06, "loss": 0.0085, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 352 }, { "completion_length": 1070.5625610351562, "epoch": 0.4304878048780488, "grad_norm": 0.24915798008441925, "kl": 0.026123046875, "learning_rate": 2.1077707579425114e-06, "loss": 0.0376, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 353 }, { "completion_length": 953.8541870117188, "epoch": 0.4317073170731707, "grad_norm": 0.4156853258609772, "kl": 0.0296630859375, "learning_rate": 2.1019275466074585e-06, "loss": 0.0097, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 354 }, { "completion_length": 1144.7916870117188, "epoch": 0.4329268292682927, "grad_norm": 0.2654878795146942, "kl": 0.0277099609375, "learning_rate": 2.0960734276377082e-06, "loss": -0.0253, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 355 }, { "completion_length": 1072.1458740234375, "epoch": 0.43414634146341463, "grad_norm": 0.2536885440349579, "kl": 0.05084228515625, "learning_rate": 2.0902085071167774e-06, "loss": -0.0073, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 356 }, { "completion_length": 804.7500305175781, "epoch": 0.4353658536585366, "grad_norm": 0.4465596675872803, "kl": 0.032470703125, "learning_rate": 2.0843328913239216e-06, "loss": -0.097, "reward": 0.2083333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 357 }, { "completion_length": 990.1666870117188, "epoch": 0.43658536585365854, "grad_norm": 679.5493774414062, "kl": 4.13946533203125, "learning_rate": 2.0784466867322037e-06, "loss": 0.0965, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 358 }, { "completion_length": 1086.2500610351562, "epoch": 0.4378048780487805, "grad_norm": 0.4612940549850464, "kl": 0.02392578125, "learning_rate": 2.0725500000065715e-06, "loss": 0.0144, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 359 }, { "completion_length": 880.9583435058594, "epoch": 0.43902439024390244, "grad_norm": 0.6914082765579224, "kl": 0.0379638671875, "learning_rate": 2.0666429380019185e-06, "loss": 0.0259, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 360 }, { "completion_length": 758.5000305175781, "epoch": 0.4402439024390244, "grad_norm": 0.5643234252929688, "kl": 0.038818359375, "learning_rate": 2.060725607761153e-06, "loss": -0.0063, "reward": 0.1458333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 361 }, { "completion_length": 842.0833435058594, "epoch": 0.44146341463414634, "grad_norm": 0.24680182337760925, "kl": 0.0428466796875, "learning_rate": 2.0547981165132547e-06, "loss": 0.0014, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 362 }, { "completion_length": 1108.1250610351562, "epoch": 0.4426829268292683, "grad_norm": 0.41155484318733215, "kl": 0.03021240234375, "learning_rate": 2.048860571671332e-06, "loss": -0.028, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 363 }, { "completion_length": 865.8125305175781, "epoch": 0.44390243902439025, "grad_norm": 0.2589362859725952, "kl": 0.0301513671875, "learning_rate": 2.0429130808306767e-06, "loss": 0.0356, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 364 }, { "completion_length": 799.2708435058594, "epoch": 0.4451219512195122, "grad_norm": 0.5328904390335083, "kl": 0.031005859375, "learning_rate": 2.036955751766815e-06, "loss": -0.0409, "reward": 0.2500000111758709, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "step": 365 }, { "completion_length": 863.5000305175781, "epoch": 0.44634146341463415, "grad_norm": 0.5315119624137878, "kl": 0.03106689453125, "learning_rate": 2.030988692433552e-06, "loss": 0.0315, "reward": 0.12500000558793545, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 366 }, { "completion_length": 1099.1875305175781, "epoch": 0.4475609756097561, "grad_norm": 0.3300071358680725, "kl": 0.030029296875, "learning_rate": 2.0250120109610155e-06, "loss": 0.0887, "reward": 0.2083333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 367 }, { "completion_length": 849.4583435058594, "epoch": 0.44878048780487806, "grad_norm": 0.30705785751342773, "kl": 0.029052734375, "learning_rate": 2.019025815653701e-06, "loss": -0.0104, "reward": 0.3333333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 368 }, { "completion_length": 746.2916870117188, "epoch": 0.45, "grad_norm": 0.5189336538314819, "kl": 0.032470703125, "learning_rate": 2.0130302149885033e-06, "loss": -0.0545, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 369 }, { "completion_length": 721.7291870117188, "epoch": 0.45121951219512196, "grad_norm": 0.21197453141212463, "kl": 0.03759765625, "learning_rate": 2.007025317612754e-06, "loss": 0.0083, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 370 }, { "completion_length": 923.8541870117188, "epoch": 0.4524390243902439, "grad_norm": 0.3330663740634918, "kl": 0.0364990234375, "learning_rate": 2.001011232342253e-06, "loss": -0.0072, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 371 }, { "completion_length": 1017.0833435058594, "epoch": 0.45365853658536587, "grad_norm": 0.5465441942214966, "kl": 0.0428466796875, "learning_rate": 1.994988068159294e-06, "loss": 0.0243, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 372 }, { "completion_length": 813.5833740234375, "epoch": 0.4548780487804878, "grad_norm": 0.37892287969589233, "kl": 0.03466796875, "learning_rate": 1.9889559342106926e-06, "loss": 0.0091, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 373 }, { "completion_length": 818.5833740234375, "epoch": 0.4560975609756098, "grad_norm": 0.06201218068599701, "kl": 0.031494140625, "learning_rate": 1.9829149398058068e-06, "loss": 0.0012, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 374 }, { "completion_length": 808.6041870117188, "epoch": 0.4573170731707317, "grad_norm": 0.6525385975837708, "kl": 0.036376953125, "learning_rate": 1.976865194414555e-06, "loss": -0.0442, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 375 }, { "completion_length": 652.3125305175781, "epoch": 0.4585365853658537, "grad_norm": 0.5023518800735474, "kl": 0.0386962890625, "learning_rate": 1.9708068076654364e-06, "loss": -0.0344, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 376 }, { "completion_length": 729.75, "epoch": 0.45975609756097563, "grad_norm": 0.23177191615104675, "kl": 0.03369140625, "learning_rate": 1.9647398893435394e-06, "loss": 0.0079, "reward": 0.2291666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 377 }, { "completion_length": 752.1666870117188, "epoch": 0.4609756097560976, "grad_norm": 0.4666472375392914, "kl": 0.051513671875, "learning_rate": 1.9586645493885565e-06, "loss": -0.0459, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 378 }, { "completion_length": 664.6875305175781, "epoch": 0.46219512195121953, "grad_norm": 0.5903889536857605, "kl": 0.0306396484375, "learning_rate": 1.9525808978927886e-06, "loss": 0.0618, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 379 }, { "completion_length": 864.8541870117188, "epoch": 0.4634146341463415, "grad_norm": 0.34605127573013306, "kl": 0.02838134765625, "learning_rate": 1.946489045099152e-06, "loss": 0.0032, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 380 }, { "completion_length": 747.9166870117188, "epoch": 0.46463414634146344, "grad_norm": 0.5324747562408447, "kl": 0.032958984375, "learning_rate": 1.94038910139918e-06, "loss": 0.0287, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 381 }, { "completion_length": 593.4375, "epoch": 0.4658536585365854, "grad_norm": 0.550981879234314, "kl": 0.033447265625, "learning_rate": 1.934281177331023e-06, "loss": 0.0041, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 382 }, { "completion_length": 740.3958435058594, "epoch": 0.46707317073170734, "grad_norm": 0.26112014055252075, "kl": 0.03411865234375, "learning_rate": 1.928165383577445e-06, "loss": 0.0041, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 383 }, { "completion_length": 730.7291870117188, "epoch": 0.4682926829268293, "grad_norm": 0.6180046200752258, "kl": 0.030029296875, "learning_rate": 1.9220418309638175e-06, "loss": -0.0243, "reward": 0.1875, "reward_std": 0.18042196333408356, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 384 }, { "completion_length": 766.4375, "epoch": 0.4695121951219512, "grad_norm": 0.6600415706634521, "kl": 0.0411376953125, "learning_rate": 1.915910630456112e-06, "loss": 0.0005, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 385 }, { "completion_length": 723.0208587646484, "epoch": 0.47073170731707314, "grad_norm": 0.6182783842086792, "kl": 0.03955078125, "learning_rate": 1.909771893158889e-06, "loss": 0.0118, "reward": 0.10416666977107525, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 386 }, { "completion_length": 688.4583435058594, "epoch": 0.4719512195121951, "grad_norm": 0.42049577832221985, "kl": 0.02972412109375, "learning_rate": 1.9036257303132843e-06, "loss": -0.0022, "reward": 0.3750000223517418, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3750000223517418, "rewards/format_reward": 0.0, "step": 387 }, { "completion_length": 921.5625305175781, "epoch": 0.47317073170731705, "grad_norm": 0.39355793595314026, "kl": 0.03167724609375, "learning_rate": 1.8974722532949929e-06, "loss": 0.0195, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 388 }, { "completion_length": 567.2291870117188, "epoch": 0.474390243902439, "grad_norm": 0.5436845421791077, "kl": 0.0390625, "learning_rate": 1.8913115736122519e-06, "loss": 0.0301, "reward": 0.2500000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 389 }, { "completion_length": 614.9791870117188, "epoch": 0.47560975609756095, "grad_norm": 0.5892400741577148, "kl": 0.03302001953125, "learning_rate": 1.8851438029038191e-06, "loss": 0.0559, "reward": 0.2916666865348816, "reward_std": 0.14433755725622177, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 390 }, { "completion_length": 885.6041870117188, "epoch": 0.4768292682926829, "grad_norm": 0.06690337508916855, "kl": 0.032470703125, "learning_rate": 1.8789690529369492e-06, "loss": 0.0011, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 391 }, { "completion_length": 493.85418701171875, "epoch": 0.47804878048780486, "grad_norm": 0.6229822039604187, "kl": 0.034912109375, "learning_rate": 1.8727874356053706e-06, "loss": -0.0013, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 392 }, { "completion_length": 755.2916870117188, "epoch": 0.4792682926829268, "grad_norm": 0.31777453422546387, "kl": 0.036376953125, "learning_rate": 1.8665990629272555e-06, "loss": 0.0094, "reward": 0.0833333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 393 }, { "completion_length": 728.6250305175781, "epoch": 0.48048780487804876, "grad_norm": 0.4183621108531952, "kl": 0.03448486328125, "learning_rate": 1.8604040470431908e-06, "loss": -0.0205, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 394 }, { "completion_length": 554.4583435058594, "epoch": 0.4817073170731707, "grad_norm": 0.5221788287162781, "kl": 0.0399169921875, "learning_rate": 1.8542025002141474e-06, "loss": 0.0101, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 395 }, { "completion_length": 802.2291870117188, "epoch": 0.48292682926829267, "grad_norm": 0.22250708937644958, "kl": 0.03021240234375, "learning_rate": 1.8479945348194423e-06, "loss": 0.0055, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 396 }, { "completion_length": 646.2916870117188, "epoch": 0.4841463414634146, "grad_norm": 0.3303127586841583, "kl": 0.037109375, "learning_rate": 1.8417802633547067e-06, "loss": -0.0063, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 397 }, { "completion_length": 712.6666870117188, "epoch": 0.4853658536585366, "grad_norm": 0.6952998042106628, "kl": 0.042236328125, "learning_rate": 1.8355597984298435e-06, "loss": -0.0255, "reward": 0.14583333395421505, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 398 }, { "completion_length": 680.8541870117188, "epoch": 0.4865853658536585, "grad_norm": 0.6096604466438293, "kl": 0.03057861328125, "learning_rate": 1.8293332527669897e-06, "loss": 0.0025, "reward": 0.1875000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 399 }, { "completion_length": 697.625, "epoch": 0.4878048780487805, "grad_norm": 0.5263100266456604, "kl": 0.0400390625, "learning_rate": 1.823100739198472e-06, "loss": 0.0056, "reward": 0.1875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 400 }, { "completion_length": 717.2916870117188, "epoch": 0.48902439024390243, "grad_norm": 0.2937505841255188, "kl": 0.0421142578125, "learning_rate": 1.816862370664762e-06, "loss": 0.0369, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 401 }, { "completion_length": 737.1250305175781, "epoch": 0.4902439024390244, "grad_norm": 0.5524131059646606, "kl": 0.03460693359375, "learning_rate": 1.8106182602124312e-06, "loss": -0.0016, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 402 }, { "completion_length": 689.3541870117188, "epoch": 0.49146341463414633, "grad_norm": 0.6056103110313416, "kl": 0.0382080078125, "learning_rate": 1.8043685209921002e-06, "loss": 0.0203, "reward": 0.1458333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 403 }, { "completion_length": 821.8958435058594, "epoch": 0.4926829268292683, "grad_norm": 0.770128607749939, "kl": 0.035400390625, "learning_rate": 1.7981132662563906e-06, "loss": 0.0777, "reward": 0.2500000149011612, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 404 }, { "completion_length": 869.4166870117188, "epoch": 0.49390243902439024, "grad_norm": 0.3313486576080322, "kl": 0.03070068359375, "learning_rate": 1.7918526093578702e-06, "loss": -0.0011, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 405 }, { "completion_length": 905.875, "epoch": 0.4951219512195122, "grad_norm": 0.29705655574798584, "kl": 0.0396728515625, "learning_rate": 1.7855866637470027e-06, "loss": -0.0047, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 406 }, { "completion_length": 667.7083740234375, "epoch": 0.49634146341463414, "grad_norm": 0.6838599443435669, "kl": 0.0286865234375, "learning_rate": 1.7793155429700868e-06, "loss": -0.0007, "reward": 0.2083333395421505, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 407 }, { "completion_length": 877.5625, "epoch": 0.4975609756097561, "grad_norm": 0.3647572100162506, "kl": 0.03082275390625, "learning_rate": 1.7730393606672033e-06, "loss": -0.0071, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 408 }, { "completion_length": 876.4791870117188, "epoch": 0.49878048780487805, "grad_norm": 0.46186333894729614, "kl": 0.03302001953125, "learning_rate": 1.7667582305701528e-06, "loss": -0.0742, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 409 }, { "completion_length": 639.5833740234375, "epoch": 0.5, "grad_norm": 0.5355751514434814, "kl": 0.0338134765625, "learning_rate": 1.7604722665003958e-06, "loss": -0.0021, "reward": 0.2083333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 410 }, { "completion_length": 850.5208740234375, "epoch": 0.501219512195122, "grad_norm": 0.4516288638114929, "kl": 0.033935546875, "learning_rate": 1.7541815823669903e-06, "loss": 0.02, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 411 }, { "completion_length": 691.6250305175781, "epoch": 0.5024390243902439, "grad_norm": 0.4676379859447479, "kl": 0.0311279296875, "learning_rate": 1.7478862921645273e-06, "loss": 0.0353, "reward": 0.14583333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 412 }, { "completion_length": 688.0, "epoch": 0.5036585365853659, "grad_norm": 0.4021396040916443, "kl": 0.03631591796875, "learning_rate": 1.7415865099710657e-06, "loss": 0.0129, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 413 }, { "completion_length": 838.8541870117188, "epoch": 0.5048780487804878, "grad_norm": 0.3340761363506317, "kl": 0.03106689453125, "learning_rate": 1.735282349946064e-06, "loss": 0.0196, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 414 }, { "completion_length": 763.1458740234375, "epoch": 0.5060975609756098, "grad_norm": 0.46428605914115906, "kl": 0.03955078125, "learning_rate": 1.7289739263283118e-06, "loss": 0.0211, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 415 }, { "completion_length": 724.7291870117188, "epoch": 0.5073170731707317, "grad_norm": 0.42527034878730774, "kl": 0.03302001953125, "learning_rate": 1.7226613534338608e-06, "loss": -0.0064, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 416 }, { "completion_length": 772.3541870117188, "epoch": 0.5085365853658537, "grad_norm": 0.16283953189849854, "kl": 0.02685546875, "learning_rate": 1.716344745653952e-06, "loss": 0.0059, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 417 }, { "completion_length": 788.2916870117188, "epoch": 0.5097560975609756, "grad_norm": 0.2448461353778839, "kl": 0.02716064453125, "learning_rate": 1.7100242174529439e-06, "loss": 0.0199, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 418 }, { "completion_length": 701.0625305175781, "epoch": 0.5109756097560976, "grad_norm": 0.544904351234436, "kl": 0.03387451171875, "learning_rate": 1.7036998833662359e-06, "loss": -0.0098, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 419 }, { "completion_length": 799.9166870117188, "epoch": 0.5121951219512195, "grad_norm": 0.06163305044174194, "kl": 0.02587890625, "learning_rate": 1.6973718579981973e-06, "loss": 0.001, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 420 }, { "completion_length": 742.2916870117188, "epoch": 0.5134146341463415, "grad_norm": 0.3775089979171753, "kl": 0.03271484375, "learning_rate": 1.6910402560200854e-06, "loss": -0.0004, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 421 }, { "completion_length": 830.2083435058594, "epoch": 0.5146341463414634, "grad_norm": 0.3336365222930908, "kl": 0.03155517578125, "learning_rate": 1.6847051921679702e-06, "loss": 0.0057, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 422 }, { "completion_length": 676.0416870117188, "epoch": 0.5158536585365854, "grad_norm": 0.493982195854187, "kl": 0.0255126953125, "learning_rate": 1.6783667812406569e-06, "loss": 0.0064, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 423 }, { "completion_length": 816.1041870117188, "epoch": 0.5170731707317073, "grad_norm": 0.3415720462799072, "kl": 0.02813720703125, "learning_rate": 1.672025138097601e-06, "loss": 0.0539, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 424 }, { "completion_length": 791.0625305175781, "epoch": 0.5182926829268293, "grad_norm": 0.756782591342926, "kl": 0.02923583984375, "learning_rate": 1.6656803776568307e-06, "loss": 0.0526, "reward": 0.2708333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 425 }, { "completion_length": 754.1666870117188, "epoch": 0.5195121951219512, "grad_norm": 0.4986019432544708, "kl": 0.0341796875, "learning_rate": 1.6593326148928643e-06, "loss": 0.001, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 426 }, { "completion_length": 776.0416870117188, "epoch": 0.5207317073170732, "grad_norm": 0.1987488865852356, "kl": 0.02716064453125, "learning_rate": 1.652981964834623e-06, "loss": 0.0324, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 427 }, { "completion_length": 732.1041870117188, "epoch": 0.5219512195121951, "grad_norm": 0.05336523428559303, "kl": 0.02783203125, "learning_rate": 1.6466285425633527e-06, "loss": 0.0011, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 428 }, { "completion_length": 686.9791870117188, "epoch": 0.5231707317073171, "grad_norm": 0.5836074948310852, "kl": 0.02978515625, "learning_rate": 1.6402724632105323e-06, "loss": 0.0141, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 429 }, { "completion_length": 660.0625305175781, "epoch": 0.524390243902439, "grad_norm": 0.3314565122127533, "kl": 0.03704833984375, "learning_rate": 1.6339138419557916e-06, "loss": 0.0029, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 430 }, { "completion_length": 806.9791870117188, "epoch": 0.525609756097561, "grad_norm": 0.3738638460636139, "kl": 0.0345458984375, "learning_rate": 1.6275527940248218e-06, "loss": 0.0445, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 431 }, { "completion_length": 1050.1250305175781, "epoch": 0.526829268292683, "grad_norm": 0.4248029589653015, "kl": 0.026611328125, "learning_rate": 1.6211894346872887e-06, "loss": -0.0202, "reward": 0.2083333432674408, "reward_std": 0.10825318098068237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 432 }, { "completion_length": 839.0833435058594, "epoch": 0.5280487804878049, "grad_norm": 0.40769124031066895, "kl": 0.0311279296875, "learning_rate": 1.614823879254744e-06, "loss": -0.0006, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 433 }, { "completion_length": 747.9166870117188, "epoch": 0.5292682926829269, "grad_norm": 0.4294043183326721, "kl": 0.0472412109375, "learning_rate": 1.6084562430785336e-06, "loss": -0.0104, "reward": 0.2083333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 434 }, { "completion_length": 946.7708740234375, "epoch": 0.5304878048780488, "grad_norm": 0.3011494278907776, "kl": 0.034912109375, "learning_rate": 1.6020866415477108e-06, "loss": -0.0333, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 435 }, { "completion_length": 1171.1458740234375, "epoch": 0.5317073170731708, "grad_norm": 0.45685434341430664, "kl": 0.02496337890625, "learning_rate": 1.5957151900869425e-06, "loss": 0.0143, "reward": 0.125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 436 }, { "completion_length": 731.5, "epoch": 0.5329268292682927, "grad_norm": 0.5969831943511963, "kl": 0.03338623046875, "learning_rate": 1.5893420041544193e-06, "loss": -0.0248, "reward": 0.1458333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 437 }, { "completion_length": 736.6041870117188, "epoch": 0.5341463414634147, "grad_norm": 0.4960964322090149, "kl": 0.02801513671875, "learning_rate": 1.582967199239761e-06, "loss": 0.081, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 438 }, { "completion_length": 662.6875305175781, "epoch": 0.5353658536585366, "grad_norm": 0.2524319291114807, "kl": 0.02606201171875, "learning_rate": 1.5765908908619258e-06, "loss": 0.0336, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 439 }, { "completion_length": 758.0208435058594, "epoch": 0.5365853658536586, "grad_norm": 0.21499498188495636, "kl": 0.0301513671875, "learning_rate": 1.5702131945671182e-06, "loss": -0.0047, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 440 }, { "completion_length": 931.4583435058594, "epoch": 0.5378048780487805, "grad_norm": 0.46516576409339905, "kl": 0.032470703125, "learning_rate": 1.5638342259266904e-06, "loss": -0.0083, "reward": 0.18750000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 441 }, { "completion_length": 762.7708435058594, "epoch": 0.5390243902439025, "grad_norm": 0.6176576614379883, "kl": 0.0413818359375, "learning_rate": 1.5574541005350532e-06, "loss": -0.0412, "reward": 0.1875, "reward_std": 0.10825318098068237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 442 }, { "completion_length": 845.1666870117188, "epoch": 0.5402439024390244, "grad_norm": 0.36604827642440796, "kl": 0.0323486328125, "learning_rate": 1.5510729340075781e-06, "loss": 0.0028, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 443 }, { "completion_length": 822.5208435058594, "epoch": 0.5414634146341464, "grad_norm": 0.4656050205230713, "kl": 0.031494140625, "learning_rate": 1.544690841978504e-06, "loss": -0.0166, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 444 }, { "completion_length": 863.9791870117188, "epoch": 0.5426829268292683, "grad_norm": 0.5311189293861389, "kl": 0.0374755859375, "learning_rate": 1.5383079400988402e-06, "loss": -0.0338, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 445 }, { "completion_length": 712.3125305175781, "epoch": 0.5439024390243903, "grad_norm": 0.5392478704452515, "kl": 0.02532958984375, "learning_rate": 1.5319243440342713e-06, "loss": -0.0118, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 446 }, { "completion_length": 713.6666870117188, "epoch": 0.5451219512195122, "grad_norm": 0.6092529892921448, "kl": 0.0330810546875, "learning_rate": 1.5255401694630625e-06, "loss": 0.0047, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 447 }, { "completion_length": 767.2083435058594, "epoch": 0.5463414634146342, "grad_norm": 0.20800291001796722, "kl": 0.0302734375, "learning_rate": 1.5191555320739608e-06, "loss": 0.014, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 448 }, { "completion_length": 755.1875305175781, "epoch": 0.5475609756097561, "grad_norm": 0.605426549911499, "kl": 0.0333251953125, "learning_rate": 1.5127705475641014e-06, "loss": -0.0052, "reward": 0.3333333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 449 }, { "completion_length": 782.125, "epoch": 0.5487804878048781, "grad_norm": 0.37740781903266907, "kl": 0.02899169921875, "learning_rate": 1.5063853316369081e-06, "loss": 0.0065, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 450 }, { "completion_length": 802.1666870117188, "epoch": 0.55, "grad_norm": 0.07578609138727188, "kl": 0.02288818359375, "learning_rate": 1.5e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 451 }, { "completion_length": 790.3125, "epoch": 0.551219512195122, "grad_norm": 0.4336966872215271, "kl": 0.02813720703125, "learning_rate": 1.4936146683630921e-06, "loss": 0.0157, "reward": 0.14583333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 452 }, { "completion_length": 895.9375, "epoch": 0.552439024390244, "grad_norm": 0.32502347230911255, "kl": 0.02935791015625, "learning_rate": 1.4872294524358989e-06, "loss": -0.0093, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 453 }, { "completion_length": 1040.1667175292969, "epoch": 0.5536585365853659, "grad_norm": 0.14283445477485657, "kl": 0.028076171875, "learning_rate": 1.4808444679260396e-06, "loss": 0.001, "reward": 0.25, "reward_std": 0.0, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 454 }, { "completion_length": 1108.2500610351562, "epoch": 0.5548780487804879, "grad_norm": 0.30400022864341736, "kl": 0.03131103515625, "learning_rate": 1.4744598305369376e-06, "loss": 0.0327, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 455 }, { "completion_length": 726.125, "epoch": 0.5560975609756098, "grad_norm": 0.09212367236614227, "kl": 0.0345458984375, "learning_rate": 1.4680756559657292e-06, "loss": 0.0013, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 456 }, { "completion_length": 756.4166870117188, "epoch": 0.5573170731707318, "grad_norm": 0.49157455563545227, "kl": 0.03619384765625, "learning_rate": 1.4616920599011603e-06, "loss": 0.0927, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 457 }, { "completion_length": 990.3333435058594, "epoch": 0.5585365853658537, "grad_norm": 0.4339282512664795, "kl": 0.0247802734375, "learning_rate": 1.4553091580214963e-06, "loss": 0.0336, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 458 }, { "completion_length": 806.1666870117188, "epoch": 0.5597560975609757, "grad_norm": 0.5246623754501343, "kl": 0.0361328125, "learning_rate": 1.4489270659924222e-06, "loss": -0.029, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 459 }, { "completion_length": 801.8958435058594, "epoch": 0.5609756097560976, "grad_norm": 0.4816710948944092, "kl": 0.027099609375, "learning_rate": 1.442545899464947e-06, "loss": 0.0141, "reward": 0.2291666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 460 }, { "completion_length": 758.0833740234375, "epoch": 0.5621951219512196, "grad_norm": 0.20983240008354187, "kl": 0.029296875, "learning_rate": 1.4361657740733103e-06, "loss": 0.0403, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 461 }, { "completion_length": 829.5, "epoch": 0.5634146341463414, "grad_norm": 0.4363538324832916, "kl": 0.03021240234375, "learning_rate": 1.429786805432882e-06, "loss": 0.0002, "reward": 0.3125000149011612, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 462 }, { "completion_length": 1017.4166870117188, "epoch": 0.5646341463414634, "grad_norm": 0.0842226967215538, "kl": 0.027587890625, "learning_rate": 1.4234091091380743e-06, "loss": 0.001, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 463 }, { "completion_length": 803.875, "epoch": 0.5658536585365853, "grad_norm": 0.18806934356689453, "kl": 0.02984619140625, "learning_rate": 1.4170328007602395e-06, "loss": -0.0075, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 464 }, { "completion_length": 1089.1041870117188, "epoch": 0.5670731707317073, "grad_norm": 0.19883829355239868, "kl": 0.02545166015625, "learning_rate": 1.4106579958455812e-06, "loss": 0.0119, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 465 }, { "completion_length": 893.5000305175781, "epoch": 0.5682926829268292, "grad_norm": 0.4463866353034973, "kl": 0.0289306640625, "learning_rate": 1.4042848099130574e-06, "loss": 0.0065, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 466 }, { "completion_length": 734.5416870117188, "epoch": 0.5695121951219512, "grad_norm": 0.6711140275001526, "kl": 0.03326416015625, "learning_rate": 1.3979133584522893e-06, "loss": 0.0101, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 467 }, { "completion_length": 709.7083435058594, "epoch": 0.5707317073170731, "grad_norm": 0.7721737623214722, "kl": 0.02752685546875, "learning_rate": 1.391543756921467e-06, "loss": -0.0311, "reward": 0.16666667722165585, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 468 }, { "completion_length": 895.5625305175781, "epoch": 0.5719512195121951, "grad_norm": 0.05096851661801338, "kl": 0.02825927734375, "learning_rate": 1.3851761207452565e-06, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 469 }, { "completion_length": 853.4583435058594, "epoch": 0.573170731707317, "grad_norm": 0.4231189787387848, "kl": 0.0272216796875, "learning_rate": 1.3788105653127118e-06, "loss": 0.0083, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 470 }, { "completion_length": 651.0833435058594, "epoch": 0.574390243902439, "grad_norm": 0.3461414575576782, "kl": 0.0380859375, "learning_rate": 1.3724472059751785e-06, "loss": 0.0157, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 471 }, { "completion_length": 867.8958435058594, "epoch": 0.5756097560975609, "grad_norm": 0.05793582275509834, "kl": 0.02783203125, "learning_rate": 1.3660861580442087e-06, "loss": 0.001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 472 }, { "completion_length": 875.6875, "epoch": 0.5768292682926829, "grad_norm": 0.5400838851928711, "kl": 0.02728271484375, "learning_rate": 1.3597275367894676e-06, "loss": -0.0105, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 473 }, { "completion_length": 818.2500610351562, "epoch": 0.5780487804878048, "grad_norm": 0.7908319234848022, "kl": 0.03375244140625, "learning_rate": 1.3533714574366473e-06, "loss": 0.0058, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 474 }, { "completion_length": 815.2500305175781, "epoch": 0.5792682926829268, "grad_norm": 0.5779252052307129, "kl": 0.0318603515625, "learning_rate": 1.3470180351653773e-06, "loss": 0.0174, "reward": 0.10416666977107525, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 475 }, { "completion_length": 864.2916870117188, "epoch": 0.5804878048780487, "grad_norm": 0.3415527045726776, "kl": 0.02471923828125, "learning_rate": 1.3406673851071362e-06, "loss": 0.0053, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 476 }, { "completion_length": 960.3333435058594, "epoch": 0.5817073170731707, "grad_norm": 0.29808786511421204, "kl": 0.0283203125, "learning_rate": 1.3343196223431698e-06, "loss": 0.0058, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 477 }, { "completion_length": 946.3333740234375, "epoch": 0.5829268292682926, "grad_norm": 0.35267508029937744, "kl": 0.0269775390625, "learning_rate": 1.3279748619023995e-06, "loss": 0.0228, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 478 }, { "completion_length": 922.4375305175781, "epoch": 0.5841463414634146, "grad_norm": 0.29554396867752075, "kl": 0.02947998046875, "learning_rate": 1.3216332187593434e-06, "loss": -0.0132, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 479 }, { "completion_length": 858.0833435058594, "epoch": 0.5853658536585366, "grad_norm": 0.39898625016212463, "kl": 0.03106689453125, "learning_rate": 1.3152948078320297e-06, "loss": -0.0085, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 480 }, { "completion_length": 581.6041717529297, "epoch": 0.5865853658536585, "grad_norm": 0.7595959305763245, "kl": 0.0352783203125, "learning_rate": 1.3089597439799151e-06, "loss": -0.0163, "reward": 0.3541666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 481 }, { "completion_length": 935.5417175292969, "epoch": 0.5878048780487805, "grad_norm": 0.4653733968734741, "kl": 0.04296875, "learning_rate": 1.3026281420018034e-06, "loss": -0.0067, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 482 }, { "completion_length": 888.1041870117188, "epoch": 0.5890243902439024, "grad_norm": 0.18640004098415375, "kl": 0.034423828125, "learning_rate": 1.2963001166337642e-06, "loss": 0.0006, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 483 }, { "completion_length": 898.3542175292969, "epoch": 0.5902439024390244, "grad_norm": 0.6631487607955933, "kl": 0.02813720703125, "learning_rate": 1.2899757825470568e-06, "loss": -0.0036, "reward": 0.14583333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 484 }, { "completion_length": 723.5208435058594, "epoch": 0.5914634146341463, "grad_norm": 0.36477863788604736, "kl": 0.02996826171875, "learning_rate": 1.283655254346048e-06, "loss": -0.0048, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 485 }, { "completion_length": 887.1875305175781, "epoch": 0.5926829268292683, "grad_norm": 0.4081045389175415, "kl": 0.05609130859375, "learning_rate": 1.2773386465661395e-06, "loss": 0.0024, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 486 }, { "completion_length": 802.3333435058594, "epoch": 0.5939024390243902, "grad_norm": 0.25304004549980164, "kl": 0.028076171875, "learning_rate": 1.2710260736716882e-06, "loss": -0.0011, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 487 }, { "completion_length": 902.0833435058594, "epoch": 0.5951219512195122, "grad_norm": 0.3382227122783661, "kl": 0.02691650390625, "learning_rate": 1.264717650053936e-06, "loss": 0.0269, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 488 }, { "completion_length": 964.2291870117188, "epoch": 0.5963414634146341, "grad_norm": 0.5060334205627441, "kl": 0.02862548828125, "learning_rate": 1.2584134900289346e-06, "loss": -0.0156, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 489 }, { "completion_length": 1043.3958740234375, "epoch": 0.5975609756097561, "grad_norm": 0.2051764875650406, "kl": 0.0284423828125, "learning_rate": 1.2521137078354728e-06, "loss": 0.0004, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 490 }, { "completion_length": 929.3333740234375, "epoch": 0.598780487804878, "grad_norm": 0.4943280518054962, "kl": 0.02301025390625, "learning_rate": 1.2458184176330102e-06, "loss": 0.0281, "reward": 0.1875, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 491 }, { "completion_length": 832.5833740234375, "epoch": 0.6, "grad_norm": 0.11139194667339325, "kl": 0.02972412109375, "learning_rate": 1.2395277334996047e-06, "loss": 0.001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 492 }, { "completion_length": 745.7708435058594, "epoch": 0.6012195121951219, "grad_norm": 0.04622248560190201, "kl": 0.0257568359375, "learning_rate": 1.2332417694298477e-06, "loss": 0.0008, "reward": 0.3125, "reward_std": 0.0, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 493 }, { "completion_length": 866.1458740234375, "epoch": 0.6024390243902439, "grad_norm": 0.06395512074232101, "kl": 0.0565185546875, "learning_rate": 1.2269606393327968e-06, "loss": 0.0012, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 494 }, { "completion_length": 779.8958740234375, "epoch": 0.6036585365853658, "grad_norm": 0.5273105502128601, "kl": 0.03179931640625, "learning_rate": 1.2206844570299133e-06, "loss": -0.112, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 495 }, { "completion_length": 780.1458740234375, "epoch": 0.6048780487804878, "grad_norm": 0.4124651849269867, "kl": 0.025634765625, "learning_rate": 1.2144133362529974e-06, "loss": -0.0126, "reward": 0.2708333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 496 }, { "completion_length": 829.9791870117188, "epoch": 0.6060975609756097, "grad_norm": 0.7791106700897217, "kl": 0.03582763671875, "learning_rate": 1.2081473906421298e-06, "loss": 0.0441, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 497 }, { "completion_length": 690.7500305175781, "epoch": 0.6073170731707317, "grad_norm": 0.5013418793678284, "kl": 0.03131103515625, "learning_rate": 1.20188673374361e-06, "loss": 0.0256, "reward": 0.1875, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 498 }, { "completion_length": 727.9375305175781, "epoch": 0.6085365853658536, "grad_norm": 0.5570080280303955, "kl": 0.02862548828125, "learning_rate": 1.1956314790078998e-06, "loss": -0.0023, "reward": 0.2708333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 499 }, { "completion_length": 867.0416870117188, "epoch": 0.6097560975609756, "grad_norm": 0.04908730089664459, "kl": 0.02886962890625, "learning_rate": 1.189381739787569e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 500 }, { "completion_length": 783.8333740234375, "epoch": 0.6109756097560975, "grad_norm": 0.3778320252895355, "kl": 0.02886962890625, "learning_rate": 1.1831376293352378e-06, "loss": 0.0196, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 501 }, { "completion_length": 792.1250305175781, "epoch": 0.6121951219512195, "grad_norm": 1.6423802375793457, "kl": 0.05072021484375, "learning_rate": 1.176899260801528e-06, "loss": -0.0162, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 502 }, { "completion_length": 828.8333740234375, "epoch": 0.6134146341463415, "grad_norm": 0.5353675484657288, "kl": 0.0289306640625, "learning_rate": 1.1706667472330101e-06, "loss": -0.0059, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 503 }, { "completion_length": 709.5000305175781, "epoch": 0.6146341463414634, "grad_norm": 0.4470565915107727, "kl": 0.026123046875, "learning_rate": 1.1644402015701568e-06, "loss": 0.0265, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 504 }, { "completion_length": 827.3541870117188, "epoch": 0.6158536585365854, "grad_norm": 0.5625233054161072, "kl": 0.0302734375, "learning_rate": 1.158219736645294e-06, "loss": 0.0489, "reward": 0.0833333358168602, "reward_std": 0.14433757960796356, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 505 }, { "completion_length": 803.9166870117188, "epoch": 0.6170731707317073, "grad_norm": 0.3888726532459259, "kl": 0.02508544921875, "learning_rate": 1.152005465180558e-06, "loss": 0.0052, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 506 }, { "completion_length": 900.4791870117188, "epoch": 0.6182926829268293, "grad_norm": 0.3920578956604004, "kl": 0.0260009765625, "learning_rate": 1.145797499785853e-06, "loss": -0.0216, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 507 }, { "completion_length": 784.7291870117188, "epoch": 0.6195121951219512, "grad_norm": 0.4152125418186188, "kl": 0.02838134765625, "learning_rate": 1.1395959529568088e-06, "loss": -0.0235, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 508 }, { "completion_length": 699.4583435058594, "epoch": 0.6207317073170732, "grad_norm": 0.3461558520793915, "kl": 0.0263671875, "learning_rate": 1.1334009370727446e-06, "loss": 0.0797, "reward": 0.2708333358168602, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 509 }, { "completion_length": 560.3125, "epoch": 0.6219512195121951, "grad_norm": 0.6555963754653931, "kl": 0.02825927734375, "learning_rate": 1.127212564394629e-06, "loss": 0.0049, "reward": 0.375, "reward_std": 0.18042196333408356, "rewards/accuracy_reward": 0.375, "rewards/format_reward": 0.0, "step": 510 }, { "completion_length": 892.4166870117188, "epoch": 0.6231707317073171, "grad_norm": 0.4940139055252075, "kl": 0.02618408203125, "learning_rate": 1.1210309470630509e-06, "loss": 0.1071, "reward": 0.1875000111758709, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 511 }, { "completion_length": 854.7083435058594, "epoch": 0.624390243902439, "grad_norm": 0.5197833776473999, "kl": 0.0255126953125, "learning_rate": 1.1148561970961818e-06, "loss": -0.0257, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 512 }, { "completion_length": 963.0833740234375, "epoch": 0.625609756097561, "grad_norm": 0.26763102412223816, "kl": 0.0277099609375, "learning_rate": 1.1086884263877486e-06, "loss": 0.0028, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 513 }, { "completion_length": 795.7917175292969, "epoch": 0.6268292682926829, "grad_norm": 0.04478263109922409, "kl": 0.03985595703125, "learning_rate": 1.1025277467050079e-06, "loss": 0.001, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 514 }, { "completion_length": 962.8125305175781, "epoch": 0.6280487804878049, "grad_norm": 0.3781687915325165, "kl": 0.03082275390625, "learning_rate": 1.0963742696867162e-06, "loss": 0.0034, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 515 }, { "completion_length": 823.6250305175781, "epoch": 0.6292682926829268, "grad_norm": 0.44658133387565613, "kl": 0.031494140625, "learning_rate": 1.0902281068411114e-06, "loss": -0.0129, "reward": 0.2500000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 516 }, { "completion_length": 748.2916870117188, "epoch": 0.6304878048780488, "grad_norm": 0.44513779878616333, "kl": 0.0255126953125, "learning_rate": 1.084089369543888e-06, "loss": 0.0591, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 517 }, { "completion_length": 778.8333740234375, "epoch": 0.6317073170731707, "grad_norm": 0.35178038477897644, "kl": 0.030029296875, "learning_rate": 1.077958169036183e-06, "loss": -0.0142, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 518 }, { "completion_length": 889.2708435058594, "epoch": 0.6329268292682927, "grad_norm": 0.26045531034469604, "kl": 0.02838134765625, "learning_rate": 1.0718346164225556e-06, "loss": 0.0006, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 519 }, { "completion_length": 910.5416870117188, "epoch": 0.6341463414634146, "grad_norm": 0.47047415375709534, "kl": 0.02703857421875, "learning_rate": 1.0657188226689772e-06, "loss": 0.047, "reward": 0.10416666977107525, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 520 }, { "completion_length": 841.9792175292969, "epoch": 0.6353658536585366, "grad_norm": 0.2454436719417572, "kl": 0.02545166015625, "learning_rate": 1.0596108986008203e-06, "loss": 0.0034, "reward": 0.2708333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 521 }, { "completion_length": 791.6250305175781, "epoch": 0.6365853658536585, "grad_norm": 0.08019955456256866, "kl": 0.02874755859375, "learning_rate": 1.0535109549008482e-06, "loss": 0.0011, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 522 }, { "completion_length": 808.9791870117188, "epoch": 0.6378048780487805, "grad_norm": 0.3111408054828644, "kl": 0.03118896484375, "learning_rate": 1.0474191021072117e-06, "loss": -0.0016, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 523 }, { "completion_length": 816.1458435058594, "epoch": 0.6390243902439025, "grad_norm": 0.4471191167831421, "kl": 0.02532958984375, "learning_rate": 1.0413354506114434e-06, "loss": -0.0062, "reward": 0.1041666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 524 }, { "completion_length": 682.3750305175781, "epoch": 0.6402439024390244, "grad_norm": 0.44450777769088745, "kl": 0.025634765625, "learning_rate": 1.0352601106564607e-06, "loss": 0.0312, "reward": 0.20833333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 525 }, { "completion_length": 646.6875305175781, "epoch": 0.6414634146341464, "grad_norm": 0.5126345157623291, "kl": 0.0330810546875, "learning_rate": 1.0291931923345635e-06, "loss": 0.0703, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 526 }, { "completion_length": 702.7916870117188, "epoch": 0.6426829268292683, "grad_norm": 0.5051405429840088, "kl": 0.02252197265625, "learning_rate": 1.0231348055854452e-06, "loss": 0.0099, "reward": 0.2083333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 527 }, { "completion_length": 879.0625, "epoch": 0.6439024390243903, "grad_norm": 0.31973937153816223, "kl": 0.03155517578125, "learning_rate": 1.0170850601941937e-06, "loss": -0.0368, "reward": 0.2291666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 528 }, { "completion_length": 880.9791870117188, "epoch": 0.6451219512195122, "grad_norm": 0.25314292311668396, "kl": 0.0260009765625, "learning_rate": 1.0110440657893074e-06, "loss": -0.008, "reward": 0.3541666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 529 }, { "completion_length": 1073.8750610351562, "epoch": 0.6463414634146342, "grad_norm": 0.4375230669975281, "kl": 0.02862548828125, "learning_rate": 1.0050119318407061e-06, "loss": -0.0044, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 530 }, { "completion_length": 703.3750305175781, "epoch": 0.6475609756097561, "grad_norm": 0.4382186233997345, "kl": 0.0386962890625, "learning_rate": 9.98988767657747e-07, "loss": 0.0117, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 531 }, { "completion_length": 947.1041870117188, "epoch": 0.6487804878048781, "grad_norm": 0.3478910028934479, "kl": 0.02777099609375, "learning_rate": 9.929746823872462e-07, "loss": 0.0117, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 532 }, { "completion_length": 968.8333740234375, "epoch": 0.65, "grad_norm": 0.05612090975046158, "kl": 0.03887939453125, "learning_rate": 9.86969785011497e-07, "loss": 0.001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 533 }, { "completion_length": 907.7500305175781, "epoch": 0.651219512195122, "grad_norm": 0.5268975496292114, "kl": 0.02972412109375, "learning_rate": 9.809741843462994e-07, "loss": 0.0468, "reward": 0.12500000558793545, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 534 }, { "completion_length": 785.0833435058594, "epoch": 0.6524390243902439, "grad_norm": 0.47635316848754883, "kl": 0.02850341796875, "learning_rate": 9.749879890389848e-07, "loss": -0.017, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 535 }, { "completion_length": 847.6875305175781, "epoch": 0.6536585365853659, "grad_norm": 0.18279653787612915, "kl": 0.02557373046875, "learning_rate": 9.690113075664488e-07, "loss": -0.002, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 536 }, { "completion_length": 901.2083740234375, "epoch": 0.6548780487804878, "grad_norm": 0.5397875905036926, "kl": 0.0302734375, "learning_rate": 9.630442482331853e-07, "loss": 0.0856, "reward": 0.18750000558793545, "reward_std": 0.25259073823690414, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 537 }, { "completion_length": 1034.0208435058594, "epoch": 0.6560975609756098, "grad_norm": 0.3381046950817108, "kl": 0.0242919921875, "learning_rate": 9.57086919169323e-07, "loss": 0.042, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 538 }, { "completion_length": 997.8125305175781, "epoch": 0.6573170731707317, "grad_norm": 0.64218670129776, "kl": 0.03057861328125, "learning_rate": 9.511394283286686e-07, "loss": 0.1128, "reward": 0.2083333432674408, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 539 }, { "completion_length": 717.875, "epoch": 0.6585365853658537, "grad_norm": 0.3277949392795563, "kl": 0.02752685546875, "learning_rate": 9.452018834867454e-07, "loss": 0.0327, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 540 }, { "completion_length": 1029.4583740234375, "epoch": 0.6597560975609756, "grad_norm": 0.24999314546585083, "kl": 0.025390625, "learning_rate": 9.392743922388469e-07, "loss": 0.0099, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 541 }, { "completion_length": 912.1458435058594, "epoch": 0.6609756097560976, "grad_norm": 0.1514778882265091, "kl": 0.02947998046875, "learning_rate": 9.333570619980818e-07, "loss": 0.0011, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 542 }, { "completion_length": 840.1458740234375, "epoch": 0.6621951219512195, "grad_norm": 0.3129235804080963, "kl": 0.02838134765625, "learning_rate": 9.27449999993429e-07, "loss": -0.0009, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 543 }, { "completion_length": 928.0000305175781, "epoch": 0.6634146341463415, "grad_norm": 0.4312836229801178, "kl": 0.02886962890625, "learning_rate": 9.215533132677969e-07, "loss": 0.0046, "reward": 0.229166679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 544 }, { "completion_length": 765.4167175292969, "epoch": 0.6646341463414634, "grad_norm": 0.7276366949081421, "kl": 0.02789306640625, "learning_rate": 9.156671086760788e-07, "loss": 0.0027, "reward": 0.2291666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2291666679084301, "rewards/format_reward": 0.0, "step": 545 }, { "completion_length": 906.8958435058594, "epoch": 0.6658536585365854, "grad_norm": 0.4692193269729614, "kl": 0.057373046875, "learning_rate": 9.097914928832228e-07, "loss": -0.084, "reward": 0.2708333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 546 }, { "completion_length": 1031.8333740234375, "epoch": 0.6670731707317074, "grad_norm": 0.21384288370609283, "kl": 0.0313720703125, "learning_rate": 9.039265723622923e-07, "loss": 0.0179, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 547 }, { "completion_length": 984.6875, "epoch": 0.6682926829268293, "grad_norm": 0.32777276635169983, "kl": 0.03143310546875, "learning_rate": 8.980724533925419e-07, "loss": 0.0412, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 548 }, { "completion_length": 822.2291870117188, "epoch": 0.6695121951219513, "grad_norm": 0.06951643526554108, "kl": 0.02813720703125, "learning_rate": 8.922292420574888e-07, "loss": 0.0011, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 549 }, { "completion_length": 1072.8333740234375, "epoch": 0.6707317073170732, "grad_norm": 0.33174851536750793, "kl": 0.03363037109375, "learning_rate": 8.863970442429902e-07, "loss": 0.0145, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 550 }, { "completion_length": 718.5416870117188, "epoch": 0.6719512195121952, "grad_norm": 0.3611091375350952, "kl": 0.0299072265625, "learning_rate": 8.805759656353275e-07, "loss": 0.0043, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 551 }, { "completion_length": 1112.2917175292969, "epoch": 0.6731707317073171, "grad_norm": 0.23453758656978607, "kl": 0.03106689453125, "learning_rate": 8.74766111719286e-07, "loss": 0.0303, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 552 }, { "completion_length": 1011.8333740234375, "epoch": 0.6743902439024391, "grad_norm": 0.4298003613948822, "kl": 0.02471923828125, "learning_rate": 8.689675877762487e-07, "loss": 0.0376, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 553 }, { "completion_length": 880.875, "epoch": 0.675609756097561, "grad_norm": 0.2480362057685852, "kl": 0.02435302734375, "learning_rate": 8.631804988822859e-07, "loss": 0.0226, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 554 }, { "completion_length": 931.6666870117188, "epoch": 0.676829268292683, "grad_norm": 0.45659956336021423, "kl": 0.0318603515625, "learning_rate": 8.574049499062509e-07, "loss": 0.0662, "reward": 0.458333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.458333358168602, "rewards/format_reward": 0.0, "step": 555 }, { "completion_length": 1066.6458435058594, "epoch": 0.6780487804878049, "grad_norm": 0.3029688894748688, "kl": 0.02972412109375, "learning_rate": 8.516410455078793e-07, "loss": 0.0435, "reward": 0.1666666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 556 }, { "completion_length": 1015.1042175292969, "epoch": 0.6792682926829269, "grad_norm": 0.43346521258354187, "kl": 0.026611328125, "learning_rate": 8.458888901358958e-07, "loss": 0.0408, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 557 }, { "completion_length": 1045.7917175292969, "epoch": 0.6804878048780488, "grad_norm": 0.21469931304454803, "kl": 0.0299072265625, "learning_rate": 8.401485880261151e-07, "loss": 0.0019, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 558 }, { "completion_length": 825.8125305175781, "epoch": 0.6817073170731708, "grad_norm": 0.052236396819353104, "kl": 0.021240234375, "learning_rate": 8.344202431995604e-07, "loss": 0.0008, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 559 }, { "completion_length": 1029.2500610351562, "epoch": 0.6829268292682927, "grad_norm": 0.06884250044822693, "kl": 0.03179931640625, "learning_rate": 8.287039594605737e-07, "loss": 0.0012, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 560 }, { "completion_length": 837.125, "epoch": 0.6841463414634147, "grad_norm": 0.8303191661834717, "kl": 0.0313720703125, "learning_rate": 8.229998403949348e-07, "loss": 0.0064, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 561 }, { "completion_length": 1083.7708740234375, "epoch": 0.6853658536585366, "grad_norm": 0.4762817323207855, "kl": 0.03076171875, "learning_rate": 8.173079893679873e-07, "loss": -0.0835, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 562 }, { "completion_length": 942.0000305175781, "epoch": 0.6865853658536586, "grad_norm": 0.39529234170913696, "kl": 0.02484130859375, "learning_rate": 8.116285095227604e-07, "loss": 0.0101, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 563 }, { "completion_length": 1088.4166870117188, "epoch": 0.6878048780487804, "grad_norm": 0.35131967067718506, "kl": 0.0321044921875, "learning_rate": 8.05961503778108e-07, "loss": -0.0654, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 564 }, { "completion_length": 1007.2291870117188, "epoch": 0.6890243902439024, "grad_norm": 0.12090548872947693, "kl": 0.02606201171875, "learning_rate": 8.003070748268339e-07, "loss": 0.0043, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 565 }, { "completion_length": 940.6250305175781, "epoch": 0.6902439024390243, "grad_norm": 0.33971157670021057, "kl": 0.02880859375, "learning_rate": 7.94665325133837e-07, "loss": -0.0299, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 566 }, { "completion_length": 1149.3958435058594, "epoch": 0.6914634146341463, "grad_norm": 0.5320213437080383, "kl": 0.0594482421875, "learning_rate": 7.890363569342539e-07, "loss": 0.0018, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 567 }, { "completion_length": 644.1041870117188, "epoch": 0.6926829268292682, "grad_norm": 0.6505311727523804, "kl": 0.03271484375, "learning_rate": 7.834202722316054e-07, "loss": -0.0397, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 568 }, { "completion_length": 821.2500305175781, "epoch": 0.6939024390243902, "grad_norm": 0.28424742817878723, "kl": 0.02581787109375, "learning_rate": 7.778171727959482e-07, "loss": -0.0189, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 569 }, { "completion_length": 986.1458740234375, "epoch": 0.6951219512195121, "grad_norm": 0.3112906217575073, "kl": 0.02734375, "learning_rate": 7.722271601620293e-07, "loss": -0.0013, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 570 }, { "completion_length": 873.5833435058594, "epoch": 0.6963414634146341, "grad_norm": 0.04128978028893471, "kl": 0.0260009765625, "learning_rate": 7.6665033562745e-07, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 571 }, { "completion_length": 774.3750305175781, "epoch": 0.697560975609756, "grad_norm": 0.4388665556907654, "kl": 0.0338134765625, "learning_rate": 7.610868002508248e-07, "loss": -0.0205, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 572 }, { "completion_length": 771.5000305175781, "epoch": 0.698780487804878, "grad_norm": 5.242128372192383, "kl": 0.05743408203125, "learning_rate": 7.555366548499551e-07, "loss": 0.0609, "reward": 0.2916666865348816, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666865348816, "rewards/format_reward": 0.0, "step": 573 }, { "completion_length": 815.7916870117188, "epoch": 0.7, "grad_norm": 0.44563284516334534, "kl": 0.02752685546875, "learning_rate": 7.500000000000003e-07, "loss": 0.0092, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 574 }, { "completion_length": 963.7708740234375, "epoch": 0.7012195121951219, "grad_norm": 0.32968661189079285, "kl": 0.02679443359375, "learning_rate": 7.444769360316534e-07, "loss": 0.0105, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 575 }, { "completion_length": 969.4375, "epoch": 0.7024390243902439, "grad_norm": 0.4815066158771515, "kl": 0.025390625, "learning_rate": 7.389675630293269e-07, "loss": -0.0301, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 576 }, { "completion_length": 816.0000305175781, "epoch": 0.7036585365853658, "grad_norm": 0.2536729574203491, "kl": 0.03240966796875, "learning_rate": 7.334719808293342e-07, "loss": 0.0069, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 577 }, { "completion_length": 830.5833435058594, "epoch": 0.7048780487804878, "grad_norm": 0.3585840165615082, "kl": 0.02716064453125, "learning_rate": 7.279902890180865e-07, "loss": 0.0016, "reward": 0.25000000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25000000558793545, "rewards/format_reward": 0.0, "step": 578 }, { "completion_length": 932.3750305175781, "epoch": 0.7060975609756097, "grad_norm": 0.5187066793441772, "kl": 0.02978515625, "learning_rate": 7.225225869302818e-07, "loss": -0.0782, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 579 }, { "completion_length": 1005.7083740234375, "epoch": 0.7073170731707317, "grad_norm": 0.313052237033844, "kl": 0.0267333984375, "learning_rate": 7.1706897364711e-07, "loss": 0.0132, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 580 }, { "completion_length": 745.5416870117188, "epoch": 0.7085365853658536, "grad_norm": 0.38321879506111145, "kl": 0.03131103515625, "learning_rate": 7.116295479944533e-07, "loss": 0.0082, "reward": 0.16666667722165585, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.16666667722165585, "rewards/format_reward": 0.0, "step": 581 }, { "completion_length": 1140.0000305175781, "epoch": 0.7097560975609756, "grad_norm": 0.6155075430870056, "kl": 0.03070068359375, "learning_rate": 7.062044085410991e-07, "loss": -0.084, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 582 }, { "completion_length": 850.8958435058594, "epoch": 0.7109756097560975, "grad_norm": 0.4988707900047302, "kl": 0.02691650390625, "learning_rate": 7.007936535969516e-07, "loss": 0.0107, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 583 }, { "completion_length": 1001.2083740234375, "epoch": 0.7121951219512195, "grad_norm": 0.4897194504737854, "kl": 0.03070068359375, "learning_rate": 6.9539738121125e-07, "loss": 0.0243, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 584 }, { "completion_length": 866.8125305175781, "epoch": 0.7134146341463414, "grad_norm": 0.5088192224502563, "kl": 0.03009033203125, "learning_rate": 6.90015689170794e-07, "loss": 0.05, "reward": 0.1458333395421505, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 585 }, { "completion_length": 1086.8958740234375, "epoch": 0.7146341463414634, "grad_norm": 0.391956090927124, "kl": 0.0238037109375, "learning_rate": 6.846486749981684e-07, "loss": 0.0635, "reward": 0.1666666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 586 }, { "completion_length": 1109.1875610351562, "epoch": 0.7158536585365853, "grad_norm": 0.5406737923622131, "kl": 0.03082275390625, "learning_rate": 6.792964359499794e-07, "loss": 0.0022, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 587 }, { "completion_length": 965.4583740234375, "epoch": 0.7170731707317073, "grad_norm": 0.472937673330307, "kl": 0.025146484375, "learning_rate": 6.739590690150903e-07, "loss": 0.027, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 588 }, { "completion_length": 740.2708435058594, "epoch": 0.7182926829268292, "grad_norm": 0.7443292140960693, "kl": 0.025390625, "learning_rate": 6.686366709128632e-07, "loss": 0.0367, "reward": 0.1458333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 589 }, { "completion_length": 623.6666870117188, "epoch": 0.7195121951219512, "grad_norm": 0.3671242594718933, "kl": 0.027099609375, "learning_rate": 6.633293380914087e-07, "loss": -0.0144, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 590 }, { "completion_length": 743.3541870117188, "epoch": 0.7207317073170731, "grad_norm": 0.4817129969596863, "kl": 0.0281982421875, "learning_rate": 6.580371667258349e-07, "loss": 0.0248, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 591 }, { "completion_length": 914.7708435058594, "epoch": 0.7219512195121951, "grad_norm": 0.6760240197181702, "kl": 0.03094482421875, "learning_rate": 6.527602527165099e-07, "loss": 0.0382, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 592 }, { "completion_length": 1096.8333740234375, "epoch": 0.723170731707317, "grad_norm": 0.40590059757232666, "kl": 0.0230712890625, "learning_rate": 6.474986916873168e-07, "loss": 0.0277, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 593 }, { "completion_length": 762.5625305175781, "epoch": 0.724390243902439, "grad_norm": 0.23105137050151825, "kl": 0.033935546875, "learning_rate": 6.422525789839273e-07, "loss": 0.0089, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 594 }, { "completion_length": 787.4375305175781, "epoch": 0.725609756097561, "grad_norm": 0.48707565665245056, "kl": 0.02789306640625, "learning_rate": 6.370220096720692e-07, "loss": -0.0576, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 595 }, { "completion_length": 706.9166870117188, "epoch": 0.7268292682926829, "grad_norm": 0.042059846222400665, "kl": 0.02459716796875, "learning_rate": 6.318070785358074e-07, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 596 }, { "completion_length": 860.6041870117188, "epoch": 0.7280487804878049, "grad_norm": 0.5042432546615601, "kl": 0.028564453125, "learning_rate": 6.266078800758249e-07, "loss": -0.0065, "reward": 0.2500000149011612, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 597 }, { "completion_length": 914.4583740234375, "epoch": 0.7292682926829268, "grad_norm": 0.3709144592285156, "kl": 0.029052734375, "learning_rate": 6.214245085077078e-07, "loss": 0.0667, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 598 }, { "completion_length": 767.2708740234375, "epoch": 0.7304878048780488, "grad_norm": 0.6649799346923828, "kl": 0.02618408203125, "learning_rate": 6.162570577602433e-07, "loss": -0.0633, "reward": 0.1458333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 599 }, { "completion_length": 687.1041870117188, "epoch": 0.7317073170731707, "grad_norm": 0.3504408299922943, "kl": 0.027099609375, "learning_rate": 6.11105621473712e-07, "loss": 0.0053, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 600 }, { "completion_length": 663.4791870117188, "epoch": 0.7329268292682927, "grad_norm": 0.33722177147865295, "kl": 0.031494140625, "learning_rate": 6.059702929981952e-07, "loss": 0.0021, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 601 }, { "completion_length": 927.5000305175781, "epoch": 0.7341463414634146, "grad_norm": 0.20690011978149414, "kl": 0.029296875, "learning_rate": 6.008511653918821e-07, "loss": 0.0659, "reward": 0.1041666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 602 }, { "completion_length": 945.1458740234375, "epoch": 0.7353658536585366, "grad_norm": 0.3113418519496918, "kl": 0.02423095703125, "learning_rate": 5.957483314193813e-07, "loss": -0.0218, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 603 }, { "completion_length": 1067.1250305175781, "epoch": 0.7365853658536585, "grad_norm": 0.16814640164375305, "kl": 0.02593994140625, "learning_rate": 5.906618835500434e-07, "loss": -0.0261, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 604 }, { "completion_length": 867.8333435058594, "epoch": 0.7378048780487805, "grad_norm": 0.46364933252334595, "kl": 0.0235595703125, "learning_rate": 5.855919139562815e-07, "loss": 0.0116, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 605 }, { "completion_length": 735.8750305175781, "epoch": 0.7390243902439024, "grad_norm": 0.2824901044368744, "kl": 0.025146484375, "learning_rate": 5.805385145119064e-07, "loss": 0.0078, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 606 }, { "completion_length": 648.3541870117188, "epoch": 0.7402439024390244, "grad_norm": 0.21588559448719025, "kl": 0.0252685546875, "learning_rate": 5.755017767904543e-07, "loss": -0.0065, "reward": 0.2708333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 607 }, { "completion_length": 936.625, "epoch": 0.7414634146341463, "grad_norm": 0.5255969166755676, "kl": 0.02447509765625, "learning_rate": 5.704817920635348e-07, "loss": 0.0084, "reward": 0.10416666977107525, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 608 }, { "completion_length": 646.2916870117188, "epoch": 0.7426829268292683, "grad_norm": 0.3593496084213257, "kl": 0.02374267578125, "learning_rate": 5.654786512991705e-07, "loss": -0.0195, "reward": 0.20833333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 609 }, { "completion_length": 762.4791870117188, "epoch": 0.7439024390243902, "grad_norm": 0.5494648814201355, "kl": 0.02764892578125, "learning_rate": 5.60492445160154e-07, "loss": 0.0277, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 610 }, { "completion_length": 615.0625, "epoch": 0.7451219512195122, "grad_norm": 0.4446452558040619, "kl": 0.02777099609375, "learning_rate": 5.555232640024021e-07, "loss": 0.0182, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 611 }, { "completion_length": 1102.7292175292969, "epoch": 0.7463414634146341, "grad_norm": 0.5812187194824219, "kl": 0.02154541015625, "learning_rate": 5.505711978733175e-07, "loss": 0.0239, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 612 }, { "completion_length": 718.625, "epoch": 0.7475609756097561, "grad_norm": 0.34378528594970703, "kl": 0.02716064453125, "learning_rate": 5.456363365101606e-07, "loss": 0.0557, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 613 }, { "completion_length": 839.0625, "epoch": 0.748780487804878, "grad_norm": 0.28123462200164795, "kl": 0.0296630859375, "learning_rate": 5.407187693384191e-07, "loss": -0.0126, "reward": 0.1666666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 614 }, { "completion_length": 853.4166870117188, "epoch": 0.75, "grad_norm": 0.2266978621482849, "kl": 0.02679443359375, "learning_rate": 5.358185854701909e-07, "loss": 0.0021, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 615 }, { "completion_length": 841.8541870117188, "epoch": 0.751219512195122, "grad_norm": 0.3853289783000946, "kl": 0.030029296875, "learning_rate": 5.309358737025682e-07, "loss": -0.006, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 616 }, { "completion_length": 699.5416870117188, "epoch": 0.7524390243902439, "grad_norm": 0.16332949697971344, "kl": 0.02691650390625, "learning_rate": 5.26070722516028e-07, "loss": -0.0019, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 617 }, { "completion_length": 603.0833435058594, "epoch": 0.7536585365853659, "grad_norm": 0.3333573043346405, "kl": 0.02734375, "learning_rate": 5.21223220072828e-07, "loss": 0.0047, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 618 }, { "completion_length": 825.9791870117188, "epoch": 0.7548780487804878, "grad_norm": 0.4847300350666046, "kl": 0.02606201171875, "learning_rate": 5.163934542154106e-07, "loss": -0.0903, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 619 }, { "completion_length": 893.6875, "epoch": 0.7560975609756098, "grad_norm": 0.3289225995540619, "kl": 0.0340576171875, "learning_rate": 5.115815124648103e-07, "loss": -0.023, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 620 }, { "completion_length": 945.3333740234375, "epoch": 0.7573170731707317, "grad_norm": 0.35044562816619873, "kl": 0.0244140625, "learning_rate": 5.067874820190684e-07, "loss": -0.0447, "reward": 0.1458333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 621 }, { "completion_length": 774.5625305175781, "epoch": 0.7585365853658537, "grad_norm": 0.23236961662769318, "kl": 0.02313232421875, "learning_rate": 5.020114497516521e-07, "loss": 0.0038, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 622 }, { "completion_length": 845.3541870117188, "epoch": 0.7597560975609756, "grad_norm": 0.5932947397232056, "kl": 0.02593994140625, "learning_rate": 4.972535022098795e-07, "loss": -0.0275, "reward": 0.3125000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 623 }, { "completion_length": 858.6875305175781, "epoch": 0.7609756097560976, "grad_norm": 0.04889252781867981, "kl": 0.02398681640625, "learning_rate": 4.925137256133533e-07, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 624 }, { "completion_length": 679.8333740234375, "epoch": 0.7621951219512195, "grad_norm": 0.5832393169403076, "kl": 0.02545166015625, "learning_rate": 4.877922058523971e-07, "loss": 0.022, "reward": 0.2083333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 625 }, { "completion_length": 909.9792175292969, "epoch": 0.7634146341463415, "grad_norm": 0.5005730986595154, "kl": 0.0291748046875, "learning_rate": 4.830890284864985e-07, "loss": 0.005, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 626 }, { "completion_length": 749.4583740234375, "epoch": 0.7646341463414634, "grad_norm": 0.39322492480278015, "kl": 0.0283203125, "learning_rate": 4.784042787427605e-07, "loss": -0.0427, "reward": 0.2083333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 627 }, { "completion_length": 819.6250305175781, "epoch": 0.7658536585365854, "grad_norm": 0.3320612609386444, "kl": 0.0233154296875, "learning_rate": 4.7373804151435456e-07, "loss": -0.0096, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 628 }, { "completion_length": 822.9791870117188, "epoch": 0.7670731707317073, "grad_norm": 0.23255078494548798, "kl": 0.02508544921875, "learning_rate": 4.6909040135898463e-07, "loss": 0.0098, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 629 }, { "completion_length": 744.3958740234375, "epoch": 0.7682926829268293, "grad_norm": 0.3943890333175659, "kl": 0.02325439453125, "learning_rate": 4.6446144249735345e-07, "loss": 0.0175, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 630 }, { "completion_length": 929.8333435058594, "epoch": 0.7695121951219512, "grad_norm": 0.2944657802581787, "kl": 0.02777099609375, "learning_rate": 4.598512488116376e-07, "loss": 0.003, "reward": 0.25, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 631 }, { "completion_length": 699.0416870117188, "epoch": 0.7707317073170732, "grad_norm": 0.5173816084861755, "kl": 0.02801513671875, "learning_rate": 4.552599038439651e-07, "loss": 0.0126, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 632 }, { "completion_length": 850.4166870117188, "epoch": 0.7719512195121951, "grad_norm": 0.46213850378990173, "kl": 0.0316162109375, "learning_rate": 4.506874907949034e-07, "loss": 0.0377, "reward": 0.0625, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 633 }, { "completion_length": 648.7500305175781, "epoch": 0.7731707317073171, "grad_norm": 0.3204668164253235, "kl": 0.02276611328125, "learning_rate": 4.461340925219522e-07, "loss": -0.0045, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 634 }, { "completion_length": 1001.9375305175781, "epoch": 0.774390243902439, "grad_norm": 0.1694246381521225, "kl": 0.02435302734375, "learning_rate": 4.4159979153804064e-07, "loss": -0.0036, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 635 }, { "completion_length": 636.1875, "epoch": 0.775609756097561, "grad_norm": 0.518161416053772, "kl": 0.02294921875, "learning_rate": 4.3708467001003305e-07, "loss": 0.0107, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 636 }, { "completion_length": 979.5208740234375, "epoch": 0.776829268292683, "grad_norm": 0.22282478213310242, "kl": 0.024871826171875, "learning_rate": 4.3258880975723777e-07, "loss": 0.0294, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 637 }, { "completion_length": 995.0625305175781, "epoch": 0.7780487804878049, "grad_norm": 0.21094514429569244, "kl": 0.02630615234375, "learning_rate": 4.2811229224992807e-07, "loss": 0.0009, "reward": 0.2291666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 638 }, { "completion_length": 774.4375305175781, "epoch": 0.7792682926829269, "grad_norm": 0.5504517555236816, "kl": 0.026123046875, "learning_rate": 4.2365519860786316e-07, "loss": 0.0057, "reward": 0.1458333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 639 }, { "completion_length": 861.0625, "epoch": 0.7804878048780488, "grad_norm": 0.11976215988397598, "kl": 0.0213623046875, "learning_rate": 4.192176095988196e-07, "loss": 0.002, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 640 }, { "completion_length": 867.6875305175781, "epoch": 0.7817073170731708, "grad_norm": 0.31083089113235474, "kl": 0.0318603515625, "learning_rate": 4.147996056371258e-07, "loss": 0.006, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 641 }, { "completion_length": 666.8333435058594, "epoch": 0.7829268292682927, "grad_norm": 0.12398859858512878, "kl": 0.02935791015625, "learning_rate": 4.1040126678220656e-07, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 642 }, { "completion_length": 733.7291870117188, "epoch": 0.7841463414634147, "grad_norm": 0.35403457283973694, "kl": 0.0262451171875, "learning_rate": 4.060226727371327e-07, "loss": -0.0117, "reward": 0.3125000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.3125000149011612, "rewards/format_reward": 0.0, "step": 643 }, { "completion_length": 926.9791870117188, "epoch": 0.7853658536585366, "grad_norm": 0.22392979264259338, "kl": 0.02423095703125, "learning_rate": 4.0166390284717475e-07, "loss": -0.0329, "reward": 0.1666666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 644 }, { "completion_length": 881.4583740234375, "epoch": 0.7865853658536586, "grad_norm": 0.27586719393730164, "kl": 0.02484130859375, "learning_rate": 3.973250360983677e-07, "loss": 0.0033, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 645 }, { "completion_length": 817.9375305175781, "epoch": 0.7878048780487805, "grad_norm": 0.366715669631958, "kl": 0.0218505859375, "learning_rate": 3.930061511160762e-07, "loss": -0.0048, "reward": 0.1458333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 646 }, { "completion_length": 749.5625305175781, "epoch": 0.7890243902439025, "grad_norm": 0.3225473165512085, "kl": 0.0262451171875, "learning_rate": 3.8870732616357364e-07, "loss": 0.0126, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 647 }, { "completion_length": 922.1041870117188, "epoch": 0.7902439024390244, "grad_norm": 0.49840256571769714, "kl": 0.02642822265625, "learning_rate": 3.8442863914062065e-07, "loss": -0.0015, "reward": 0.1041666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 648 }, { "completion_length": 629.3541870117188, "epoch": 0.7914634146341464, "grad_norm": 0.5496554970741272, "kl": 0.02532958984375, "learning_rate": 3.8017016758205597e-07, "loss": -0.0105, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 649 }, { "completion_length": 666.8958740234375, "epoch": 0.7926829268292683, "grad_norm": 0.45331189036369324, "kl": 0.031982421875, "learning_rate": 3.759319886563905e-07, "loss": -0.0191, "reward": 0.25, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.25, "rewards/format_reward": 0.0, "step": 650 }, { "completion_length": 1043.6250610351562, "epoch": 0.7939024390243903, "grad_norm": 0.46646979451179504, "kl": 0.024169921875, "learning_rate": 3.7171417916440714e-07, "loss": 0.1326, "reward": 0.1250000037252903, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 651 }, { "completion_length": 1071.1875610351562, "epoch": 0.7951219512195122, "grad_norm": 0.3821835517883301, "kl": 0.022705078125, "learning_rate": 3.6751681553777236e-07, "loss": 0.0294, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 652 }, { "completion_length": 844.9791870117188, "epoch": 0.7963414634146342, "grad_norm": 0.30517253279685974, "kl": 0.0240478515625, "learning_rate": 3.633399738376491e-07, "loss": 0.0046, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 653 }, { "completion_length": 838.4792175292969, "epoch": 0.7975609756097561, "grad_norm": 0.5359827876091003, "kl": 0.03106689453125, "learning_rate": 3.5918372975331933e-07, "loss": 0.0247, "reward": 0.229166679084301, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 654 }, { "completion_length": 1229.2500610351562, "epoch": 0.7987804878048781, "grad_norm": 0.46620362997055054, "kl": 0.0257568359375, "learning_rate": 3.5504815860081056e-07, "loss": -0.0116, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 655 }, { "completion_length": 768.3125, "epoch": 0.8, "grad_norm": 0.5677731037139893, "kl": 0.03173828125, "learning_rate": 3.5093333532153313e-07, "loss": 0.0289, "reward": 0.2708333432674408, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 656 }, { "completion_length": 994.2916870117188, "epoch": 0.801219512195122, "grad_norm": 0.13922348618507385, "kl": 0.0245361328125, "learning_rate": 3.468393344809222e-07, "loss": 0.0132, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 657 }, { "completion_length": 898.0208435058594, "epoch": 0.802439024390244, "grad_norm": 0.24220433831214905, "kl": 0.02447509765625, "learning_rate": 3.4276623026708556e-07, "loss": 0.0095, "reward": 0.2708333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 658 }, { "completion_length": 786.2500305175781, "epoch": 0.8036585365853659, "grad_norm": 0.34243243932724, "kl": 0.02618408203125, "learning_rate": 3.3871409648945955e-07, "loss": 0.0175, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 659 }, { "completion_length": 809.3333435058594, "epoch": 0.8048780487804879, "grad_norm": 0.45875081419944763, "kl": 0.02392578125, "learning_rate": 3.346830065774706e-07, "loss": 0.0062, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 660 }, { "completion_length": 864.7083740234375, "epoch": 0.8060975609756098, "grad_norm": 0.3514421582221985, "kl": 0.0238037109375, "learning_rate": 3.306730335792075e-07, "loss": -0.0071, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 661 }, { "completion_length": 615.25, "epoch": 0.8073170731707318, "grad_norm": 0.5857967734336853, "kl": 0.02655029296875, "learning_rate": 3.266842501600934e-07, "loss": -0.0196, "reward": 0.1875, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 662 }, { "completion_length": 780.3333740234375, "epoch": 0.8085365853658537, "grad_norm": 0.5814476013183594, "kl": 0.02825927734375, "learning_rate": 3.2271672860157324e-07, "loss": 0.0054, "reward": 0.1875, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 663 }, { "completion_length": 860.0416870117188, "epoch": 0.8097560975609757, "grad_norm": 0.25556638836860657, "kl": 0.0343017578125, "learning_rate": 3.187705407998018e-07, "loss": 0.0115, "reward": 0.06250000186264515, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 664 }, { "completion_length": 735.75, "epoch": 0.8109756097560976, "grad_norm": 0.5753107666969299, "kl": 0.026123046875, "learning_rate": 3.148457582643398e-07, "loss": 0.0642, "reward": 0.2500000074505806, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 665 }, { "completion_length": 1075.9375, "epoch": 0.8121951219512196, "grad_norm": 0.04850023239850998, "kl": 0.0252685546875, "learning_rate": 3.1094245211686106e-07, "loss": 0.0008, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 666 }, { "completion_length": 955.8750610351562, "epoch": 0.8134146341463414, "grad_norm": 0.0982297733426094, "kl": 0.024169921875, "learning_rate": 3.070606930898602e-07, "loss": 0.0046, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 667 }, { "completion_length": 863.0416870117188, "epoch": 0.8146341463414634, "grad_norm": 0.4044858515262604, "kl": 0.0328369140625, "learning_rate": 3.032005515253751e-07, "loss": 0.0122, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 668 }, { "completion_length": 818.6250305175781, "epoch": 0.8158536585365853, "grad_norm": 0.38262733817100525, "kl": 0.02764892578125, "learning_rate": 2.9936209737370727e-07, "loss": -0.0002, "reward": 0.1458333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 669 }, { "completion_length": 820.3958740234375, "epoch": 0.8170731707317073, "grad_norm": 0.44554057717323303, "kl": 0.02392578125, "learning_rate": 2.955454001921588e-07, "loss": 0.0221, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 670 }, { "completion_length": 706.2083435058594, "epoch": 0.8182926829268292, "grad_norm": 0.5388452410697937, "kl": 0.0235595703125, "learning_rate": 2.917505291437683e-07, "loss": 0.0026, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 671 }, { "completion_length": 759.5000305175781, "epoch": 0.8195121951219512, "grad_norm": 0.24352578818798065, "kl": 0.080078125, "learning_rate": 2.879775529960603e-07, "loss": 0.0065, "reward": 0.2500000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 672 }, { "completion_length": 817.3958435058594, "epoch": 0.8207317073170731, "grad_norm": 0.6730024218559265, "kl": 0.02984619140625, "learning_rate": 2.842265401197982e-07, "loss": 0.01, "reward": 0.2916666679084301, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666679084301, "rewards/format_reward": 0.0, "step": 673 }, { "completion_length": 978.3125, "epoch": 0.8219512195121951, "grad_norm": 0.4777490794658661, "kl": 0.02362060546875, "learning_rate": 2.8049755848774337e-07, "loss": -0.0511, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 674 }, { "completion_length": 830.3541870117188, "epoch": 0.823170731707317, "grad_norm": 0.44030094146728516, "kl": 0.03741455078125, "learning_rate": 2.7679067567342766e-07, "loss": -0.0183, "reward": 0.2916666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 675 }, { "completion_length": 970.1666870117188, "epoch": 0.824390243902439, "grad_norm": 0.43740084767341614, "kl": 0.02618408203125, "learning_rate": 2.7310595884992354e-07, "loss": 0.0676, "reward": 0.125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 676 }, { "completion_length": 720.5000305175781, "epoch": 0.8256097560975609, "grad_norm": 0.5037770867347717, "kl": 0.02215576171875, "learning_rate": 2.6944347478863226e-07, "loss": 0.0005, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 677 }, { "completion_length": 679.2916870117188, "epoch": 0.8268292682926829, "grad_norm": 0.5386813282966614, "kl": 0.023681640625, "learning_rate": 2.658032898580702e-07, "loss": 0.0202, "reward": 0.2500000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 678 }, { "completion_length": 824.7083435058594, "epoch": 0.8280487804878048, "grad_norm": 0.5326714515686035, "kl": 0.02606201171875, "learning_rate": 2.621854700226663e-07, "loss": 0.0196, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 679 }, { "completion_length": 772.2916870117188, "epoch": 0.8292682926829268, "grad_norm": 0.36251839995384216, "kl": 0.02960205078125, "learning_rate": 2.5859008084156986e-07, "loss": 0.0207, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 680 }, { "completion_length": 1063.3542175292969, "epoch": 0.8304878048780487, "grad_norm": 0.3171287477016449, "kl": 0.0240478515625, "learning_rate": 2.5501718746745766e-07, "loss": -0.016, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 681 }, { "completion_length": 958.7291870117188, "epoch": 0.8317073170731707, "grad_norm": 0.04372232034802437, "kl": 0.025390625, "learning_rate": 2.514668546453592e-07, "loss": 0.0009, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 682 }, { "completion_length": 701.7083435058594, "epoch": 0.8329268292682926, "grad_norm": 0.5557974576950073, "kl": 0.02490234375, "learning_rate": 2.4793914671147745e-07, "loss": -0.0015, "reward": 0.2500000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 683 }, { "completion_length": 828.6875305175781, "epoch": 0.8341463414634146, "grad_norm": 0.5050874352455139, "kl": 0.0203857421875, "learning_rate": 2.4443412759202745e-07, "loss": -0.0188, "reward": 0.2708333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333358168602, "rewards/format_reward": 0.0, "step": 684 }, { "completion_length": 703.7708435058594, "epoch": 0.8353658536585366, "grad_norm": 0.5317684412002563, "kl": 0.032470703125, "learning_rate": 2.4095186080207505e-07, "loss": -0.0035, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 685 }, { "completion_length": 779.8541870117188, "epoch": 0.8365853658536585, "grad_norm": 0.4637664556503296, "kl": 0.029296875, "learning_rate": 2.3749240944438845e-07, "loss": 0.023, "reward": 0.1458333358168602, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 686 }, { "completion_length": 781.6875305175781, "epoch": 0.8378048780487805, "grad_norm": 0.4552900493144989, "kl": 0.02520751953125, "learning_rate": 2.3405583620829268e-07, "loss": 0.0113, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 687 }, { "completion_length": 861.0625305175781, "epoch": 0.8390243902439024, "grad_norm": 0.5198604464530945, "kl": 0.02117919921875, "learning_rate": 2.3064220336853398e-07, "loss": -0.0567, "reward": 0.3541666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 688 }, { "completion_length": 784.625, "epoch": 0.8402439024390244, "grad_norm": 0.37156882882118225, "kl": 0.0289306640625, "learning_rate": 2.272515727841527e-07, "loss": -0.0117, "reward": 0.2500000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 689 }, { "completion_length": 993.1667175292969, "epoch": 0.8414634146341463, "grad_norm": 0.42797571420669556, "kl": 0.0313720703125, "learning_rate": 2.2388400589735985e-07, "loss": 0.0018, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 690 }, { "completion_length": 807.8333435058594, "epoch": 0.8426829268292683, "grad_norm": 0.3258882164955139, "kl": 0.0267333984375, "learning_rate": 2.205395637324264e-07, "loss": -0.0123, "reward": 0.20833333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 691 }, { "completion_length": 652.2291870117188, "epoch": 0.8439024390243902, "grad_norm": 0.5457414984703064, "kl": 0.029541015625, "learning_rate": 2.1721830689457583e-07, "loss": 0.0421, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 692 }, { "completion_length": 672.4166870117188, "epoch": 0.8451219512195122, "grad_norm": 0.4368482828140259, "kl": 0.02880859375, "learning_rate": 2.1392029556888576e-07, "loss": 0.0331, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 693 }, { "completion_length": 723.6458435058594, "epoch": 0.8463414634146341, "grad_norm": 0.41581639647483826, "kl": 0.02862548828125, "learning_rate": 2.1064558951919854e-07, "loss": 0.0154, "reward": 0.3333333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.3333333432674408, "rewards/format_reward": 0.0, "step": 694 }, { "completion_length": 751.6875, "epoch": 0.8475609756097561, "grad_norm": 0.2076808363199234, "kl": 0.0267333984375, "learning_rate": 2.0739424808703638e-07, "loss": -0.0015, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 695 }, { "completion_length": 965.2708435058594, "epoch": 0.848780487804878, "grad_norm": 0.1890517622232437, "kl": 0.085205078125, "learning_rate": 2.0416633019052882e-07, "loss": -0.0136, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 696 }, { "completion_length": 715.3333435058594, "epoch": 0.85, "grad_norm": 0.3903053402900696, "kl": 0.0257568359375, "learning_rate": 2.0096189432334195e-07, "loss": -0.0048, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 697 }, { "completion_length": 1027.1250305175781, "epoch": 0.8512195121951219, "grad_norm": 0.1765126883983612, "kl": 0.0223388671875, "learning_rate": 1.9778099855362085e-07, "loss": -0.0027, "reward": 0.1666666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 698 }, { "completion_length": 798.6458740234375, "epoch": 0.8524390243902439, "grad_norm": 0.5328000783920288, "kl": 0.02410888671875, "learning_rate": 1.9462370052293544e-07, "loss": 0.005, "reward": 0.1666666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 699 }, { "completion_length": 913.8750305175781, "epoch": 0.8536585365853658, "grad_norm": 0.8566571474075317, "kl": 0.0328369140625, "learning_rate": 1.9149005744523757e-07, "loss": 0.0011, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 700 }, { "completion_length": 729.0833435058594, "epoch": 0.8548780487804878, "grad_norm": 0.45158228278160095, "kl": 0.030029296875, "learning_rate": 1.8838012610582356e-07, "loss": 0.0429, "reward": 0.1458333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 701 }, { "completion_length": 836.5416870117188, "epoch": 0.8560975609756097, "grad_norm": 0.3114115595817566, "kl": 0.0238037109375, "learning_rate": 1.852939628603046e-07, "loss": -0.0105, "reward": 0.1041666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 702 }, { "completion_length": 730.625, "epoch": 0.8573170731707317, "grad_norm": 0.3165081739425659, "kl": 0.02349853515625, "learning_rate": 1.822316236335867e-07, "loss": -0.0146, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 703 }, { "completion_length": 1010.5416870117188, "epoch": 0.8585365853658536, "grad_norm": 0.25261008739471436, "kl": 0.0235595703125, "learning_rate": 1.7919316391885593e-07, "loss": 0.0463, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 704 }, { "completion_length": 785.5625305175781, "epoch": 0.8597560975609756, "grad_norm": 0.3985195755958557, "kl": 0.0279541015625, "learning_rate": 1.761786387765743e-07, "loss": -0.0239, "reward": 0.2500000149011612, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 705 }, { "completion_length": 849.5, "epoch": 0.8609756097560975, "grad_norm": 0.37897011637687683, "kl": 0.024169921875, "learning_rate": 1.731881028334808e-07, "loss": 0.0273, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 706 }, { "completion_length": 1048.3125, "epoch": 0.8621951219512195, "grad_norm": 0.1622416377067566, "kl": 0.02752685546875, "learning_rate": 1.7022161028160244e-07, "loss": 0.0162, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 707 }, { "completion_length": 802.9166870117188, "epoch": 0.8634146341463415, "grad_norm": 0.4028719365596771, "kl": 0.0225830078125, "learning_rate": 1.6727921487727095e-07, "loss": 0.0212, "reward": 0.2916666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 708 }, { "completion_length": 723.5416870117188, "epoch": 0.8646341463414634, "grad_norm": 0.2419525682926178, "kl": 0.02032470703125, "learning_rate": 1.64360969940149e-07, "loss": -0.0051, "reward": 0.20833333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 709 }, { "completion_length": 1010.3125, "epoch": 0.8658536585365854, "grad_norm": 0.04453768953680992, "kl": 0.0252685546875, "learning_rate": 1.6146692835226669e-07, "loss": 0.0009, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 710 }, { "completion_length": 1053.875, "epoch": 0.8670731707317073, "grad_norm": 0.3233415186405182, "kl": 0.0244140625, "learning_rate": 1.5859714255705843e-07, "loss": 0.0632, "reward": 0.2083333395421505, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 711 }, { "completion_length": 745.6041870117188, "epoch": 0.8682926829268293, "grad_norm": 0.3168937861919403, "kl": 0.0281982421875, "learning_rate": 1.5575166455841677e-07, "loss": 0.048, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 712 }, { "completion_length": 700.4166870117188, "epoch": 0.8695121951219512, "grad_norm": 0.4607069194316864, "kl": 0.02447509765625, "learning_rate": 1.5293054591974726e-07, "loss": -0.0158, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 713 }, { "completion_length": 1071.3125305175781, "epoch": 0.8707317073170732, "grad_norm": 0.27966293692588806, "kl": 0.020263671875, "learning_rate": 1.501338377630362e-07, "loss": 0.0557, "reward": 0.1875, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 714 }, { "completion_length": 937.6458435058594, "epoch": 0.8719512195121951, "grad_norm": 0.27274319529533386, "kl": 0.04486083984375, "learning_rate": 1.473615907679229e-07, "loss": 0.0042, "reward": 0.125, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 715 }, { "completion_length": 880.2500305175781, "epoch": 0.8731707317073171, "grad_norm": 0.3870413899421692, "kl": 0.024169921875, "learning_rate": 1.446138551707814e-07, "loss": -0.0014, "reward": 0.2083333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333395421505, "rewards/format_reward": 0.0, "step": 716 }, { "completion_length": 1127.7916870117188, "epoch": 0.874390243902439, "grad_norm": 0.1338748186826706, "kl": 0.0244140625, "learning_rate": 1.4189068076381078e-07, "loss": 0.0268, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 717 }, { "completion_length": 919.7708435058594, "epoch": 0.875609756097561, "grad_norm": 0.27817457914352417, "kl": 0.020477294921875, "learning_rate": 1.3919211689413207e-07, "loss": 0.0074, "reward": 0.2708333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 718 }, { "completion_length": 723.7291870117188, "epoch": 0.8768292682926829, "grad_norm": 0.06823945790529251, "kl": 0.02557373046875, "learning_rate": 1.365182124628949e-07, "loss": 0.001, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 719 }, { "completion_length": 1069.0000610351562, "epoch": 0.8780487804878049, "grad_norm": 0.13704067468643188, "kl": 0.0267333984375, "learning_rate": 1.3386901592439071e-07, "loss": 0.0003, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 720 }, { "completion_length": 632.8125305175781, "epoch": 0.8792682926829268, "grad_norm": 0.2622615098953247, "kl": 0.0281982421875, "learning_rate": 1.3124457528517503e-07, "loss": 0.0065, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 721 }, { "completion_length": 881.5416870117188, "epoch": 0.8804878048780488, "grad_norm": 0.24476896226406097, "kl": 0.02850341796875, "learning_rate": 1.2864493810319676e-07, "loss": 0.0161, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 722 }, { "completion_length": 832.8750305175781, "epoch": 0.8817073170731707, "grad_norm": 0.41959550976753235, "kl": 0.02484130859375, "learning_rate": 1.260701514869379e-07, "loss": 0.0916, "reward": 0.2708333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 723 }, { "completion_length": 1051.7083435058594, "epoch": 0.8829268292682927, "grad_norm": 0.37093260884284973, "kl": 0.0228271484375, "learning_rate": 1.2352026209455808e-07, "loss": -0.0032, "reward": 0.06250000186264515, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.06250000186264515, "rewards/format_reward": 0.0, "step": 724 }, { "completion_length": 1032.4791870117188, "epoch": 0.8841463414634146, "grad_norm": 0.4205034673213959, "kl": 0.02294921875, "learning_rate": 1.209953161330507e-07, "loss": 0.013, "reward": 0.2500000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000111758709, "rewards/format_reward": 0.0, "step": 725 }, { "completion_length": 672.6458435058594, "epoch": 0.8853658536585366, "grad_norm": 0.32758957147598267, "kl": 0.02978515625, "learning_rate": 1.1849535935740474e-07, "loss": 0.0171, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 726 }, { "completion_length": 1033.6875305175781, "epoch": 0.8865853658536585, "grad_norm": 0.2726954221725464, "kl": 0.0230712890625, "learning_rate": 1.1602043706977538e-07, "loss": 0.0574, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 727 }, { "completion_length": 668.8958435058594, "epoch": 0.8878048780487805, "grad_norm": 0.4613305628299713, "kl": 0.02630615234375, "learning_rate": 1.1357059411866355e-07, "loss": 0.0132, "reward": 0.2500000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2500000074505806, "rewards/format_reward": 0.0, "step": 728 }, { "completion_length": 1183.0416870117188, "epoch": 0.8890243902439025, "grad_norm": 0.3565730154514313, "kl": 0.0208740234375, "learning_rate": 1.1114587489810352e-07, "loss": 0.0297, "reward": 0.18750000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 729 }, { "completion_length": 764.0833435058594, "epoch": 0.8902439024390244, "grad_norm": 0.38788408041000366, "kl": 0.02801513671875, "learning_rate": 1.0874632334685808e-07, "loss": 0.0557, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 730 }, { "completion_length": 773.1250305175781, "epoch": 0.8914634146341464, "grad_norm": 0.16996777057647705, "kl": 0.02484130859375, "learning_rate": 1.0637198294762152e-07, "loss": 0.0126, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 731 }, { "completion_length": 1004.3958435058594, "epoch": 0.8926829268292683, "grad_norm": 0.27469712495803833, "kl": 0.0269775390625, "learning_rate": 1.0402289672623272e-07, "loss": 0.0084, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 732 }, { "completion_length": 1066.2291870117188, "epoch": 0.8939024390243903, "grad_norm": 0.09178400784730911, "kl": 0.02496337890625, "learning_rate": 1.0169910725089548e-07, "loss": 0.0009, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 733 }, { "completion_length": 953.7708740234375, "epoch": 0.8951219512195122, "grad_norm": 0.2735711634159088, "kl": 0.023681640625, "learning_rate": 9.940065663140663e-08, "loss": 0.0439, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 734 }, { "completion_length": 837.6042175292969, "epoch": 0.8963414634146342, "grad_norm": 0.47646215558052063, "kl": 0.02392578125, "learning_rate": 9.71275865183936e-08, "loss": 0.0015, "reward": 0.2083333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 735 }, { "completion_length": 929.4166870117188, "epoch": 0.8975609756097561, "grad_norm": 0.43811649084091187, "kl": 0.02923583984375, "learning_rate": 9.487993810255823e-08, "loss": 0.0975, "reward": 0.1875, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 736 }, { "completion_length": 994.0833740234375, "epoch": 0.8987804878048781, "grad_norm": 1.1990416049957275, "kl": 0.0303955078125, "learning_rate": 9.265775211393224e-08, "loss": -0.0442, "reward": 0.1875000074505806, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 737 }, { "completion_length": 819.0000305175781, "epoch": 0.9, "grad_norm": 0.17224682867527008, "kl": 0.02337646484375, "learning_rate": 9.046106882113752e-08, "loss": -0.0084, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 738 }, { "completion_length": 1121.4583435058594, "epoch": 0.901219512195122, "grad_norm": 0.3978723883628845, "kl": 0.02642822265625, "learning_rate": 8.828992803065772e-08, "loss": -0.0758, "reward": 0.1875000074505806, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 739 }, { "completion_length": 716.4583435058594, "epoch": 0.9024390243902439, "grad_norm": 0.7046301364898682, "kl": 0.03155517578125, "learning_rate": 8.614436908611617e-08, "loss": 0.0477, "reward": 0.3333333358168602, "reward_std": 0.21650634706020355, "rewards/accuracy_reward": 0.3333333358168602, "rewards/format_reward": 0.0, "step": 740 }, { "completion_length": 1084.0208740234375, "epoch": 0.9036585365853659, "grad_norm": 0.5535774827003479, "kl": 0.031982421875, "learning_rate": 8.402443086756273e-08, "loss": -0.1231, "reward": 0.08333333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 741 }, { "completion_length": 919.0625305175781, "epoch": 0.9048780487804878, "grad_norm": 0.23134127259254456, "kl": 0.02215576171875, "learning_rate": 8.193015179076996e-08, "loss": 0.0253, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 742 }, { "completion_length": 807.6875, "epoch": 0.9060975609756098, "grad_norm": 0.04322041571140289, "kl": 0.02398681640625, "learning_rate": 7.986156980653653e-08, "loss": 0.0009, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 743 }, { "completion_length": 938.7292175292969, "epoch": 0.9073170731707317, "grad_norm": 0.21795502305030823, "kl": 0.02130126953125, "learning_rate": 7.781872239999993e-08, "loss": -0.0017, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 744 }, { "completion_length": 762.6875, "epoch": 0.9085365853658537, "grad_norm": 0.2852475345134735, "kl": 0.0247802734375, "learning_rate": 7.580164658995603e-08, "loss": 0.0202, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 745 }, { "completion_length": 974.8333435058594, "epoch": 0.9097560975609756, "grad_norm": 0.24209746718406677, "kl": 0.02294921875, "learning_rate": 7.381037892818959e-08, "loss": -0.0242, "reward": 0.1250000037252903, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 746 }, { "completion_length": 886.3125, "epoch": 0.9109756097560976, "grad_norm": 0.47765588760375977, "kl": 0.02490234375, "learning_rate": 7.184495549881131e-08, "loss": -0.0703, "reward": 0.1458333395421505, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1458333395421505, "rewards/format_reward": 0.0, "step": 747 }, { "completion_length": 766.6458435058594, "epoch": 0.9121951219512195, "grad_norm": 0.3742899000644684, "kl": 0.02581787109375, "learning_rate": 6.990541191760418e-08, "loss": -0.004, "reward": 0.3541666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 748 }, { "completion_length": 741.5000305175781, "epoch": 0.9134146341463415, "grad_norm": 0.4017605781555176, "kl": 0.029052734375, "learning_rate": 6.799178333137784e-08, "loss": 0.0276, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 749 }, { "completion_length": 1010.5833740234375, "epoch": 0.9146341463414634, "grad_norm": 0.27121710777282715, "kl": 0.0203857421875, "learning_rate": 6.610410441733156e-08, "loss": 0.0389, "reward": 0.0625, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 750 }, { "completion_length": 790.3333740234375, "epoch": 0.9158536585365854, "grad_norm": 0.05629832670092583, "kl": 0.02557373046875, "learning_rate": 6.424240938242643e-08, "loss": 0.0009, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 751 }, { "completion_length": 862.3125, "epoch": 0.9170731707317074, "grad_norm": 0.24488425254821777, "kl": 0.0301513671875, "learning_rate": 6.24067319627642e-08, "loss": 0.0261, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 752 }, { "completion_length": 889.4791870117188, "epoch": 0.9182926829268293, "grad_norm": 0.47951433062553406, "kl": 0.0274658203125, "learning_rate": 6.059710542297824e-08, "loss": 0.011, "reward": 0.12500000558793545, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 753 }, { "completion_length": 845.4583435058594, "epoch": 0.9195121951219513, "grad_norm": 0.33234408497810364, "kl": 0.02655029296875, "learning_rate": 5.8813562555628585e-08, "loss": -0.0212, "reward": 0.0416666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 754 }, { "completion_length": 733.3541870117188, "epoch": 0.9207317073170732, "grad_norm": 0.6557818651199341, "kl": 0.031005859375, "learning_rate": 5.7056135680607965e-08, "loss": 0.046, "reward": 0.3125, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.3125, "rewards/format_reward": 0.0, "step": 755 }, { "completion_length": 630.2083435058594, "epoch": 0.9219512195121952, "grad_norm": 0.6298221945762634, "kl": 0.02850341796875, "learning_rate": 5.532485664455755e-08, "loss": 0.0159, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 756 }, { "completion_length": 761.6875305175781, "epoch": 0.9231707317073171, "grad_norm": 0.6046322584152222, "kl": 0.032470703125, "learning_rate": 5.3619756820288525e-08, "loss": -0.0381, "reward": 0.3541666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.3541666716337204, "rewards/format_reward": 0.0, "step": 757 }, { "completion_length": 716.9375305175781, "epoch": 0.9243902439024391, "grad_norm": 0.4098731279373169, "kl": 0.027587890625, "learning_rate": 5.194086710621404e-08, "loss": 0.0823, "reward": 0.125, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 758 }, { "completion_length": 937.5, "epoch": 0.925609756097561, "grad_norm": 0.37171196937561035, "kl": 0.02362060546875, "learning_rate": 5.0288217925789025e-08, "loss": 0.0248, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 759 }, { "completion_length": 794.0208740234375, "epoch": 0.926829268292683, "grad_norm": 0.20126201212406158, "kl": 0.0208740234375, "learning_rate": 4.86618392269596e-08, "loss": -0.007, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 760 }, { "completion_length": 825.875, "epoch": 0.9280487804878049, "grad_norm": 0.6448650360107422, "kl": 0.03265380859375, "learning_rate": 4.70617604816192e-08, "loss": 0.0139, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 761 }, { "completion_length": 883.6666870117188, "epoch": 0.9292682926829269, "grad_norm": 0.42482876777648926, "kl": 0.02520751953125, "learning_rate": 4.54880106850758e-08, "loss": -0.0098, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 762 }, { "completion_length": 1032.2708435058594, "epoch": 0.9304878048780488, "grad_norm": 0.46651068329811096, "kl": 0.0216064453125, "learning_rate": 4.394061835552554e-08, "loss": -0.0285, "reward": 0.10416666977107525, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 763 }, { "completion_length": 801.9375305175781, "epoch": 0.9317073170731708, "grad_norm": 0.04411710798740387, "kl": 0.0208740234375, "learning_rate": 4.2419611533536296e-08, "loss": 0.0007, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 764 }, { "completion_length": 791.7916870117188, "epoch": 0.9329268292682927, "grad_norm": 0.41850683093070984, "kl": 0.026123046875, "learning_rate": 4.0925017781539896e-08, "loss": 0.0028, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 765 }, { "completion_length": 867.875, "epoch": 0.9341463414634147, "grad_norm": 0.04630811884999275, "kl": 0.02459716796875, "learning_rate": 3.9456864183331557e-08, "loss": 0.0009, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 766 }, { "completion_length": 806.6458435058594, "epoch": 0.9353658536585366, "grad_norm": 0.04593589901924133, "kl": 0.02264404296875, "learning_rate": 3.80151773435804e-08, "loss": 0.0008, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 767 }, { "completion_length": 833.9791870117188, "epoch": 0.9365853658536586, "grad_norm": 0.2245476394891739, "kl": 0.02105712890625, "learning_rate": 3.659998338734671e-08, "loss": 0.0015, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 768 }, { "completion_length": 763.0625, "epoch": 0.9378048780487804, "grad_norm": 0.0570383220911026, "kl": 0.0294189453125, "learning_rate": 3.5211307959608475e-08, "loss": 0.001, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 769 }, { "completion_length": 796.9375, "epoch": 0.9390243902439024, "grad_norm": 0.28452613949775696, "kl": 0.0203857421875, "learning_rate": 3.3849176224796884e-08, "loss": -0.0315, "reward": 0.1875000074505806, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000074505806, "rewards/format_reward": 0.0, "step": 770 }, { "completion_length": 746.1458435058594, "epoch": 0.9402439024390243, "grad_norm": 0.5315723419189453, "kl": 0.0302734375, "learning_rate": 3.2513612866339916e-08, "loss": 0.0077, "reward": 0.125, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.125, "rewards/format_reward": 0.0, "step": 771 }, { "completion_length": 879.4375610351562, "epoch": 0.9414634146341463, "grad_norm": 0.756249725818634, "kl": 0.0343017578125, "learning_rate": 3.1204642086215817e-08, "loss": -0.0351, "reward": 0.1666666716337204, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 772 }, { "completion_length": 782.3750305175781, "epoch": 0.9426829268292682, "grad_norm": 0.28776443004608154, "kl": 0.022216796875, "learning_rate": 2.992228760451349e-08, "loss": 0.0504, "reward": 0.1666666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 773 }, { "completion_length": 723.0833740234375, "epoch": 0.9439024390243902, "grad_norm": 0.5710537433624268, "kl": 0.0245361328125, "learning_rate": 2.8666572659003965e-08, "loss": -0.0, "reward": 0.1250000037252903, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 774 }, { "completion_length": 816.8125305175781, "epoch": 0.9451219512195121, "grad_norm": 0.343589723110199, "kl": 0.01898193359375, "learning_rate": 2.743752000471761e-08, "loss": 0.0147, "reward": 0.2916666716337204, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 775 }, { "completion_length": 814.2291870117188, "epoch": 0.9463414634146341, "grad_norm": 0.40398523211479187, "kl": 0.0257568359375, "learning_rate": 2.6235151913533595e-08, "loss": 0.0236, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 776 }, { "completion_length": 745.4791870117188, "epoch": 0.947560975609756, "grad_norm": 0.6614434719085693, "kl": 0.02471923828125, "learning_rate": 2.50594901737749e-08, "loss": 0.0419, "reward": 0.12500000558793545, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 777 }, { "completion_length": 758.8333435058594, "epoch": 0.948780487804878, "grad_norm": 0.2255949229001999, "kl": 0.02142333984375, "learning_rate": 2.3910556089814294e-08, "loss": 0.0001, "reward": 0.20833333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.20833333395421505, "rewards/format_reward": 0.0, "step": 778 }, { "completion_length": 945.1875610351562, "epoch": 0.95, "grad_norm": 0.42945098876953125, "kl": 0.0296630859375, "learning_rate": 2.278837048168797e-08, "loss": 0.0276, "reward": 0.1250000037252903, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.1250000037252903, "rewards/format_reward": 0.0, "step": 779 }, { "completion_length": 918.0000305175781, "epoch": 0.9512195121951219, "grad_norm": 0.04331444576382637, "kl": 0.01953125, "learning_rate": 2.1692953684718187e-08, "loss": 0.0008, "reward": 0.0625, "reward_std": 0.0, "rewards/accuracy_reward": 0.0625, "rewards/format_reward": 0.0, "step": 780 }, { "completion_length": 679.4791870117188, "epoch": 0.9524390243902439, "grad_norm": 0.41175445914268494, "kl": 0.02374267578125, "learning_rate": 2.0624325549144894e-08, "loss": 0.0085, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 781 }, { "completion_length": 994.3125610351562, "epoch": 0.9536585365853658, "grad_norm": 0.17546556890010834, "kl": 0.02337646484375, "learning_rate": 1.9582505439766028e-08, "loss": 0.0414, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 782 }, { "completion_length": 831.7291870117188, "epoch": 0.9548780487804878, "grad_norm": 0.4530733823776245, "kl": 0.0316162109375, "learning_rate": 1.856751223558695e-08, "loss": -0.0156, "reward": 0.2291666716337204, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2291666716337204, "rewards/format_reward": 0.0, "step": 783 }, { "completion_length": 720.4583435058594, "epoch": 0.9560975609756097, "grad_norm": 0.41587570309638977, "kl": 0.0303955078125, "learning_rate": 1.7579364329477375e-08, "loss": 0.0231, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 784 }, { "completion_length": 896.4791870117188, "epoch": 0.9573170731707317, "grad_norm": 0.17405299842357635, "kl": 0.02294921875, "learning_rate": 1.661807962783851e-08, "loss": 0.0575, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 785 }, { "completion_length": 773.2708435058594, "epoch": 0.9585365853658536, "grad_norm": 0.39356529712677, "kl": 0.0245361328125, "learning_rate": 1.5683675550279943e-08, "loss": 0.0176, "reward": 0.1666666679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666679084301, "rewards/format_reward": 0.0, "step": 786 }, { "completion_length": 956.1250305175781, "epoch": 0.9597560975609756, "grad_norm": 0.3414008915424347, "kl": 0.02886962890625, "learning_rate": 1.4776169029301234e-08, "loss": 0.0331, "reward": 0.2916666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2916666716337204, "rewards/format_reward": 0.0, "step": 787 }, { "completion_length": 773.2291870117188, "epoch": 0.9609756097560975, "grad_norm": 0.34748509526252747, "kl": 0.02716064453125, "learning_rate": 1.3895576509987685e-08, "loss": 0.0049, "reward": 0.08333333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.08333333395421505, "rewards/format_reward": 0.0, "step": 788 }, { "completion_length": 677.2916870117188, "epoch": 0.9621951219512195, "grad_norm": 0.3142717182636261, "kl": 0.02398681640625, "learning_rate": 1.3041913949710715e-08, "loss": 0.0035, "reward": 0.14583333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 789 }, { "completion_length": 958.1250305175781, "epoch": 0.9634146341463414, "grad_norm": 0.25102928280830383, "kl": 0.0235595703125, "learning_rate": 1.2215196817839447e-08, "loss": 0.0045, "reward": 0.1458333432674408, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333432674408, "rewards/format_reward": 0.0, "step": 790 }, { "completion_length": 911.7708435058594, "epoch": 0.9646341463414634, "grad_norm": 0.34268632531166077, "kl": 0.02667236328125, "learning_rate": 1.1415440095460083e-08, "loss": 0.0186, "reward": 0.2083333432674408, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 791 }, { "completion_length": 738.9583740234375, "epoch": 0.9658536585365853, "grad_norm": 0.31120502948760986, "kl": 0.02655029296875, "learning_rate": 1.06426582751043e-08, "loss": 0.0329, "reward": 0.2500000149011612, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 792 }, { "completion_length": 906.5208435058594, "epoch": 0.9670731707317073, "grad_norm": 0.4276106357574463, "kl": 0.02557373046875, "learning_rate": 9.896865360487451e-09, "loss": 0.0771, "reward": 0.14583333395421505, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.14583333395421505, "rewards/format_reward": 0.0, "step": 793 }, { "completion_length": 715.0833435058594, "epoch": 0.9682926829268292, "grad_norm": 0.6245980858802795, "kl": 0.03021240234375, "learning_rate": 9.178074866253605e-09, "loss": -0.0076, "reward": 0.1666666716337204, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 794 }, { "completion_length": 973.9583435058594, "epoch": 0.9695121951219512, "grad_norm": 0.40942537784576416, "kl": 0.0260009765625, "learning_rate": 8.486299817731412e-09, "loss": 0.0285, "reward": 0.2708333432674408, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2708333432674408, "rewards/format_reward": 0.0, "step": 795 }, { "completion_length": 1057.3125305175781, "epoch": 0.9707317073170731, "grad_norm": 0.3129737973213196, "kl": 0.023193359375, "learning_rate": 7.821552750697958e-09, "loss": 0.0336, "reward": 0.1875000111758709, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 796 }, { "completion_length": 657.4583740234375, "epoch": 0.9719512195121951, "grad_norm": 0.4153045117855072, "kl": 0.02880859375, "learning_rate": 7.1838457111516044e-09, "loss": -0.0107, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 797 }, { "completion_length": 1035.6875610351562, "epoch": 0.973170731707317, "grad_norm": 0.0584120973944664, "kl": 0.023193359375, "learning_rate": 6.573190255093342e-09, "loss": 0.0009, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 798 }, { "completion_length": 841.0416870117188, "epoch": 0.974390243902439, "grad_norm": 0.5308164954185486, "kl": 0.03045654296875, "learning_rate": 5.989597448317785e-09, "loss": 0.0024, "reward": 0.1458333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 799 }, { "completion_length": 785.0208740234375, "epoch": 0.975609756097561, "grad_norm": 0.4039956331253052, "kl": 0.0228271484375, "learning_rate": 5.433077866212999e-09, "loss": 0.0233, "reward": 0.291666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.291666679084301, "rewards/format_reward": 0.0, "step": 800 }, { "completion_length": 893.2500305175781, "epoch": 0.9768292682926829, "grad_norm": 0.5676692128181458, "kl": 0.03875732421875, "learning_rate": 4.903641593567654e-09, "loss": -0.0039, "reward": 0.229166679084301, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.229166679084301, "rewards/format_reward": 0.0, "step": 801 }, { "completion_length": 719.5416870117188, "epoch": 0.9780487804878049, "grad_norm": 0.3240124583244324, "kl": 0.02880859375, "learning_rate": 4.401298224389338e-09, "loss": 0.0029, "reward": 0.1041666679084301, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1041666679084301, "rewards/format_reward": 0.0, "step": 802 }, { "completion_length": 845.0833435058594, "epoch": 0.9792682926829268, "grad_norm": 0.33833006024360657, "kl": 0.026123046875, "learning_rate": 3.926056861730532e-09, "loss": 0.0627, "reward": 0.10416666977107525, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.10416666977107525, "rewards/format_reward": 0.0, "step": 803 }, { "completion_length": 827.5208435058594, "epoch": 0.9804878048780488, "grad_norm": 0.3172595798969269, "kl": 0.02911376953125, "learning_rate": 3.4779261175232334e-09, "loss": -0.0376, "reward": 0.12500000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.12500000558793545, "rewards/format_reward": 0.0, "step": 804 }, { "completion_length": 686.9583435058594, "epoch": 0.9817073170731707, "grad_norm": 0.3726345896720886, "kl": 0.0220947265625, "learning_rate": 3.0569141124234256e-09, "loss": -0.0216, "reward": 0.1041666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1041666716337204, "rewards/format_reward": 0.0, "step": 805 }, { "completion_length": 852.8333435058594, "epoch": 0.9829268292682927, "grad_norm": 0.4700186848640442, "kl": 0.0269775390625, "learning_rate": 2.6630284756635204e-09, "loss": -0.0481, "reward": 0.0833333358168602, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 806 }, { "completion_length": 827.2916870117188, "epoch": 0.9841463414634146, "grad_norm": 0.05038120225071907, "kl": 0.02862548828125, "learning_rate": 2.2962763449141387e-09, "loss": 0.001, "reward": 0.1875, "reward_std": 0.0, "rewards/accuracy_reward": 0.1875, "rewards/format_reward": 0.0, "step": 807 }, { "completion_length": 835.0, "epoch": 0.9853658536585366, "grad_norm": 0.3867391049861908, "kl": 0.02215576171875, "learning_rate": 1.9566643661550478e-09, "loss": 0.0422, "reward": 0.2083333432674408, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.2083333432674408, "rewards/format_reward": 0.0, "step": 808 }, { "completion_length": 799.6458435058594, "epoch": 0.9865853658536585, "grad_norm": 0.3166770935058594, "kl": 0.023681640625, "learning_rate": 1.6441986935545884e-09, "loss": -0.0102, "reward": 0.02083333395421505, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.02083333395421505, "rewards/format_reward": 0.0, "step": 809 }, { "completion_length": 727.6041870117188, "epoch": 0.9878048780487805, "grad_norm": 0.17712058126926422, "kl": 0.0240478515625, "learning_rate": 1.3588849893579336e-09, "loss": -0.0024, "reward": 0.0833333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.0833333358168602, "rewards/format_reward": 0.0, "step": 810 }, { "completion_length": 1028.0833740234375, "epoch": 0.9890243902439024, "grad_norm": 0.04795070365071297, "kl": 0.0245361328125, "learning_rate": 1.1007284237850025e-09, "loss": 0.0009, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward": 0.0, "rewards/format_reward": 0.0, "step": 811 }, { "completion_length": 856.7291870117188, "epoch": 0.9902439024390244, "grad_norm": 0.36679479479789734, "kl": 0.0206298828125, "learning_rate": 8.697336749358687e-10, "loss": -0.0008, "reward": 0.1458333358168602, "reward_std": 0.03608439117670059, "rewards/accuracy_reward": 0.1458333358168602, "rewards/format_reward": 0.0, "step": 812 }, { "completion_length": 1035.4167175292969, "epoch": 0.9914634146341463, "grad_norm": 0.2617432773113251, "kl": 0.0228271484375, "learning_rate": 6.659049287071617e-10, "loss": 0.0244, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 813 }, { "completion_length": 999.3125305175781, "epoch": 0.9926829268292683, "grad_norm": 0.4567527174949646, "kl": 0.02532958984375, "learning_rate": 4.892458787154608e-10, "loss": 0.0007, "reward": 0.0416666679084301, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.0416666679084301, "rewards/format_reward": 0.0, "step": 814 }, { "completion_length": 858.7291870117188, "epoch": 0.9939024390243902, "grad_norm": 0.6128376126289368, "kl": 0.0572509765625, "learning_rate": 3.397597262300156e-10, "loss": -0.0398, "reward": 0.2500000149011612, "reward_std": 0.18042195588350296, "rewards/accuracy_reward": 0.2500000149011612, "rewards/format_reward": 0.0, "step": 815 }, { "completion_length": 1070.7708740234375, "epoch": 0.9951219512195122, "grad_norm": 0.3184763491153717, "kl": 0.02508544921875, "learning_rate": 2.1744918011595837e-10, "loss": 0.0767, "reward": 0.18750000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 816 }, { "completion_length": 882.7917175292969, "epoch": 0.9963414634146341, "grad_norm": 0.33201614022254944, "kl": 0.030517578125, "learning_rate": 1.2231645678401072e-10, "loss": 0.0595, "reward": 0.2083333358168602, "reward_std": 0.14433756470680237, "rewards/accuracy_reward": 0.2083333358168602, "rewards/format_reward": 0.0, "step": 817 }, { "completion_length": 911.9583740234375, "epoch": 0.9975609756097561, "grad_norm": 0.33788737654685974, "kl": 0.02301025390625, "learning_rate": 5.436328015101522e-11, "loss": 0.0052, "reward": 0.1666666716337204, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1666666716337204, "rewards/format_reward": 0.0, "step": 818 }, { "completion_length": 693.2083740234375, "epoch": 0.998780487804878, "grad_norm": 0.5074701905250549, "kl": 0.01910400390625, "learning_rate": 1.359088160846067e-11, "loss": -0.0122, "reward": 0.1875000111758709, "reward_std": 0.07216878235340118, "rewards/accuracy_reward": 0.1875000111758709, "rewards/format_reward": 0.0, "step": 819 }, { "completion_length": 1057.03125, "epoch": 1.0, "grad_norm": 0.466864675283432, "kl": 0.02581787109375, "learning_rate": 0.0, "loss": -0.0094, "reward": 0.18750000558793545, "reward_std": 0.10825317353010178, "rewards/accuracy_reward": 0.18750000558793545, "rewards/format_reward": 0.0, "step": 820 }, { "epoch": 1.0, "step": 820, "total_flos": 0.0, "train_loss": 0.004393076130298962, "train_runtime": 23663.0394, "train_samples_per_second": 0.554, "train_steps_per_second": 0.035 } ], "logging_steps": 1, "max_steps": 820, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }