{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994666666666666, "eval_steps": 100, "global_step": 937, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 195.34375, "epoch": 0.0010666666666666667, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0044, "reward": -0.05581664200872183, "reward_std": 0.22136071452405304, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.2953999750316143, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.2395833432674408, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 194.265625, "epoch": 0.005333333333333333, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.034, "reward": -0.029378876788541675, "reward_std": 0.2783559260133188, "rewards/accuracy_reward": 0.015625, "rewards/cosine_scaled_reward": -0.27677471633069217, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.23177084093913436, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 195.75625, "epoch": 0.010666666666666666, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0113, "reward": 0.03780887741595507, "reward_std": 0.32640064391307533, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.299691129103303, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.3250000080093741, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 187.6375, "epoch": 0.016, "grad_norm": 2.727306365966797, "kl": 0.0, "learning_rate": 5.319148936170213e-08, "loss": 0.0319, "reward": 0.02281488720327616, "reward_std": 0.3450261281337589, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": -0.2584351147990674, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.2437500048428774, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 195.69375, "epoch": 0.021333333333333333, "grad_norm": 2.623044013977051, "kl": -7.709860801696777e-06, "learning_rate": 3.1914893617021275e-07, "loss": -0.0101, "reward": -0.04130282774567604, "reward_std": 0.2893871849635616, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.29546949565410613, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.24166667126119137, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 191.08125, "epoch": 0.02666666666666667, "grad_norm": 2.724001884460449, "kl": 3.969669342041015e-06, "learning_rate": 5.851063829787235e-07, "loss": 0.0142, "reward": 0.07371731325984002, "reward_std": 0.3645049626007676, "rewards/accuracy_reward": 0.05625, "rewards/cosine_scaled_reward": -0.2554493617266417, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.27291667386889457, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 197.31875, "epoch": 0.032, "grad_norm": 2.69883131980896, "kl": 0.00025610625743865967, "learning_rate": 8.510638297872341e-07, "loss": 0.0041, "reward": -0.03928825343027711, "reward_std": 0.31049659312702715, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.3247049249708652, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.27916667200624945, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 198.0125, "epoch": 0.037333333333333336, "grad_norm": 2.714967727661133, "kl": 0.0017081737518310548, "learning_rate": 1.1170212765957447e-06, "loss": -0.0011, "reward": -0.013306560833007098, "reward_std": 0.2835619566962123, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.3153898956254125, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.2770833408460021, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 199.4875, "epoch": 0.042666666666666665, "grad_norm": 2.374499559402466, "kl": 0.0033346176147460937, "learning_rate": 1.3829787234042555e-06, "loss": 0.0038, "reward": 0.08598366118967533, "reward_std": 0.29695698702707884, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.3035996824502945, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.3770833430811763, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 198.94375, "epoch": 0.048, "grad_norm": 2.4198946952819824, "kl": 0.007924556732177734, "learning_rate": 1.648936170212766e-06, "loss": 0.0057, "reward": 0.1478586002252996, "reward_std": 0.251297368388623, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.31672474220395086, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.46458334363996984, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 199.36875, "epoch": 0.05333333333333334, "grad_norm": 2.5116357803344727, "kl": 0.01243419647216797, "learning_rate": 1.9148936170212767e-06, "loss": 0.0035, "reward": 0.2862342089414597, "reward_std": 0.2713195723015815, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.29501580335199834, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.5750000137835741, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 197.58125, "epoch": 0.058666666666666666, "grad_norm": 2.543461561203003, "kl": 0.020899581909179687, "learning_rate": 2.1808510638297876e-06, "loss": -0.0066, "reward": 0.43956980630755427, "reward_std": 0.2677960195578635, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.2791802009567618, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.6875000156462192, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 199.325, "epoch": 0.064, "grad_norm": 2.8129403591156006, "kl": 0.042889404296875, "learning_rate": 2.446808510638298e-06, "loss": -0.004, "reward": 0.4680780492722988, "reward_std": 0.18648194698616863, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.33192195519804957, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8000000169500708, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 194.63125, "epoch": 0.06933333333333333, "grad_norm": 2.7215657234191895, "kl": 0.045915985107421876, "learning_rate": 2.7127659574468084e-06, "loss": 0.0162, "reward": 0.6141613692045211, "reward_std": 0.23586739597376435, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.269171973131597, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8583333477377891, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 194.975, "epoch": 0.07466666666666667, "grad_norm": 2.7333619594573975, "kl": 0.06381301879882813, "learning_rate": 2.978723404255319e-06, "loss": -0.005, "reward": 0.7049825556576252, "reward_std": 0.2429952388862148, "rewards/accuracy_reward": 0.04375, "rewards/cosine_scaled_reward": -0.23876744713634251, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9000000089406968, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 188.5, "epoch": 0.08, "grad_norm": 3.1279549598693848, "kl": 0.09346847534179688, "learning_rate": 3.191489361702128e-06, "loss": -0.0124, "reward": 0.6519632238894701, "reward_std": 0.20673316456377505, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.26470344662666323, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8979166761040688, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 186.46875, "epoch": 0.08533333333333333, "grad_norm": 2.657968759536743, "kl": 0.10477294921875, "learning_rate": 3.457446808510639e-06, "loss": 0.0064, "reward": 0.685008542239666, "reward_std": 0.20272360693197697, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.2337414619512856, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.906250013411045, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 181.475, "epoch": 0.09066666666666667, "grad_norm": 3.079928159713745, "kl": 0.13871002197265625, "learning_rate": 3.723404255319149e-06, "loss": 0.0185, "reward": 0.743844810128212, "reward_std": 0.08398498701862991, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.22073852475732564, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9645833358168602, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 184.05, "epoch": 0.096, "grad_norm": 3.4379212856292725, "kl": 0.17229766845703126, "learning_rate": 3.98936170212766e-06, "loss": 0.0396, "reward": 0.799515800178051, "reward_std": 0.12321573820663616, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.1963175404816866, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9645833358168602, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 179.325, "epoch": 0.10133333333333333, "grad_norm": 3.069020986557007, "kl": 0.20455322265625, "learning_rate": 4.255319148936171e-06, "loss": 0.0521, "reward": 0.8741896212100982, "reward_std": 0.19565205958206205, "rewards/accuracy_reward": 0.04375, "rewards/cosine_scaled_reward": -0.14247704413719475, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9729166701436043, "step": 95 }, { "epoch": 0.10666666666666667, "grad_norm": 5.592884063720703, "learning_rate": 4.414893617021277e-06, "loss": 0.0194, "step": 100 }, { "epoch": 0.10666666666666667, "eval_clip_ratio": 0.0, "eval_completion_length": 176.2587, "eval_kl": 0.447543896484375, "eval_loss": 0.07140910625457764, "eval_reward": 0.7571212956573814, "eval_reward_std": 0.33554665619740265, "eval_rewards/accuracy_reward": 0.035, "eval_rewards/cosine_scaled_reward": -0.16731203964566813, "eval_rewards/format_reward": 0.0034, "eval_rewards/reasoning_steps_reward": 0.8860333378657699, "eval_runtime": 45451.5577, "eval_samples_per_second": 0.11, "eval_steps_per_second": 0.028, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 168.865625, "epoch": 0.112, "grad_norm": 6.593380451202393, "kl": 0.4110877990722656, "learning_rate": 4.680851063829788e-06, "loss": 0.0897, "reward": 0.7458049319684505, "reward_std": 0.2787426192197017, "rewards/accuracy_reward": 0.015625, "rewards/cosine_scaled_reward": -0.15940340738743544, "rewards/format_reward": 0.00625, "rewards/reasoning_steps_reward": 0.8833333402872086, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 164.0125, "epoch": 0.11733333333333333, "grad_norm": 8.10860538482666, "kl": 0.537298583984375, "learning_rate": 4.946808510638298e-06, "loss": 0.0605, "reward": 0.7644045952707529, "reward_std": 0.3260716760531068, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": -0.1293454023078084, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8562500037252903, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 139.275, "epoch": 0.12266666666666666, "grad_norm": 4.738821029663086, "kl": 0.5784271240234375, "learning_rate": 4.9998437598688195e-06, "loss": 0.0979, "reward": 0.9231873728334904, "reward_std": 0.30108860426116735, "rewards/accuracy_reward": 0.06875, "rewards/cosine_scaled_reward": -0.07472930029034615, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9291666746139526, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 120.325, "epoch": 0.128, "grad_norm": 4.913420677185059, "kl": 0.8223297119140625, "learning_rate": 4.998889029787758e-06, "loss": 0.0936, "reward": 0.9338858745992183, "reward_std": 0.403242010390386, "rewards/accuracy_reward": 0.0875, "rewards/cosine_scaled_reward": -0.022364137368276714, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8687500029802322, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 154.9625, "epoch": 0.13333333333333333, "grad_norm": 13.690719604492188, "kl": 2.79224853515625, "learning_rate": 4.997500548457231e-06, "loss": 0.208, "reward": 0.8483377784490586, "reward_std": 0.41392044560052454, "rewards/accuracy_reward": 0.08125, "rewards/cosine_scaled_reward": -0.11207889374345541, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8791666701436043, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 169.90625, "epoch": 0.13866666666666666, "grad_norm": 5.515982151031494, "kl": 6.988699340820313, "learning_rate": 4.9955571065548795e-06, "loss": 0.3533, "reward": 0.5842950815334916, "reward_std": 0.38892504015238955, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.21153825148940086, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.7770833354443312, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 150.0, "epoch": 0.144, "grad_norm": 5.65593147277832, "kl": 0.9134048461914063, "learning_rate": 4.992348060495989e-06, "loss": 0.0899, "reward": 0.979816447198391, "reward_std": 0.3385909158969298, "rewards/accuracy_reward": 0.10625, "rewards/cosine_scaled_reward": -0.06810021735727786, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9416666716337204, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 168.94375, "epoch": 0.14933333333333335, "grad_norm": 12.159282684326172, "kl": 1.6743255615234376, "learning_rate": 4.9882736864879e-06, "loss": 0.1391, "reward": 0.8389136493206024, "reward_std": 0.3920943819917738, "rewards/accuracy_reward": 0.08125, "rewards/cosine_scaled_reward": -0.1631696756929159, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9208333402872085, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 169.19375, "epoch": 0.15466666666666667, "grad_norm": 14.845711708068848, "kl": 1.3467193603515626, "learning_rate": 4.983335399128258e-06, "loss": 0.1243, "reward": 0.8044680153485387, "reward_std": 0.2859855240676552, "rewards/accuracy_reward": 0.05, "rewards/cosine_scaled_reward": -0.1351153214694932, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8895833373069764, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 160.75625, "epoch": 0.16, "grad_norm": 18.14019012451172, "kl": 1.78900146484375, "learning_rate": 4.977534912960124e-06, "loss": 0.1092, "reward": 0.8079865228617564, "reward_std": 0.31241205376572906, "rewards/accuracy_reward": 0.04375, "rewards/cosine_scaled_reward": -0.11076348531059921, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8750000029802323, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 144.0125, "epoch": 0.16533333333333333, "grad_norm": 20.268800735473633, "kl": 3.7872955322265627, "learning_rate": 4.970874241876697e-06, "loss": 0.2331, "reward": 0.6273253193552364, "reward_std": 0.35022516273102156, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.13309134985704532, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.7416666714474559, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 143.4125, "epoch": 0.17066666666666666, "grad_norm": 11.427456855773926, "kl": 3.261151123046875, "learning_rate": 4.963355698422092e-06, "loss": 0.193, "reward": 0.6995385489520232, "reward_std": 0.3108037303014498, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.1004614585451236, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.7750000052154065, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 145.35, "epoch": 0.176, "grad_norm": 5.5330586433410645, "kl": 2.344598388671875, "learning_rate": 4.954981892988451e-06, "loss": 0.2493, "reward": 0.8669513031840325, "reward_std": 0.20902994847856463, "rewards/accuracy_reward": 0.03125, "rewards/cosine_scaled_reward": -0.103882029466331, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9395833387970924, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 145.88125, "epoch": 0.18133333333333335, "grad_norm": 17.83176040649414, "kl": 2.1351898193359373, "learning_rate": 4.945755732909625e-06, "loss": 0.2572, "reward": 0.8633685514330864, "reward_std": 0.1485096547054127, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.10954811349511147, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.960416667163372, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 123.525, "epoch": 0.18666666666666668, "grad_norm": 10.693355560302734, "kl": 3.7545166015625, "learning_rate": 4.935680421451764e-06, "loss": 0.2588, "reward": 0.8673717919737101, "reward_std": 0.2996328216511756, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": -0.0555448760278523, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8854166723787784, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 99.0375, "epoch": 0.192, "grad_norm": 8.319129943847656, "kl": 2.70517578125, "learning_rate": 4.924759456701167e-06, "loss": 0.2177, "reward": 0.8939216539263726, "reward_std": 0.30960728515638036, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": -0.020661678398028016, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8770833417773247, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 93.4625, "epoch": 0.19733333333333333, "grad_norm": 9.506866455078125, "kl": 3.8556793212890623, "learning_rate": 4.912996630349765e-06, "loss": 0.2313, "reward": 0.8351011492311955, "reward_std": 0.26371640426805243, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.04614884976763278, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8625000029802322, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 99.05, "epoch": 0.20266666666666666, "grad_norm": 10.067543983459473, "kl": 2.504437255859375, "learning_rate": 4.900396026378671e-06, "loss": 0.2384, "reward": 0.9122196048498153, "reward_std": 0.17188128359848634, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.046113729919306935, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9458333417773247, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 115.89375, "epoch": 0.208, "grad_norm": 14.831565856933594, "kl": 6.83585205078125, "learning_rate": 4.886962019640244e-06, "loss": 0.5534, "reward": 0.8909050643444061, "reward_std": 0.13241582510527222, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.08409493574872613, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9687500029802323, "step": 195 }, { "epoch": 0.21333333333333335, "grad_norm": 13.112170219421387, "learning_rate": 4.872699274339169e-06, "loss": 0.4056, "step": 200 }, { "epoch": 0.21333333333333335, "eval_clip_ratio": 0.0, "eval_completion_length": 104.26975, "eval_kl": NaN, "eval_loss": NaN, "eval_reward": 0.8497997302287249, "eval_reward_std": 0.22811159837578293, "eval_rewards/accuracy_reward": 0.01425, "eval_rewards/cosine_scaled_reward": -0.08245027167908939, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9180000054657459, "eval_runtime": 36604.6644, "eval_samples_per_second": 0.137, "eval_steps_per_second": 0.034, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 101.375, "epoch": 0.21866666666666668, "grad_norm": 13.828842163085938, "kl": 4.618588256835937, "learning_rate": 4.857612742413072e-06, "loss": 0.45, "reward": 0.8754412285983563, "reward_std": 0.22013528265815693, "rewards/accuracy_reward": 0.015625, "rewards/cosine_scaled_reward": -0.0776837759069167, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9375000052154064, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 119.96875, "epoch": 0.224, "grad_norm": 9.70358657836914, "kl": 6.018792724609375, "learning_rate": 4.8417076618132434e-06, "loss": 0.4681, "reward": 0.7821227680891752, "reward_std": 0.23853054337669163, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.12204390410333872, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8979166731238365, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 124.28125, "epoch": 0.22933333333333333, "grad_norm": 25.299379348754883, "kl": 4.7243896484375, "learning_rate": 4.824989554686043e-06, "loss": 0.3798, "reward": 0.8066352348774671, "reward_std": 0.267948625725694, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.10794810801744462, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8958333440124988, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 121.075, "epoch": 0.23466666666666666, "grad_norm": 7.767666816711426, "kl": 3.927545166015625, "learning_rate": 4.807464225455655e-06, "loss": 0.3358, "reward": 0.8636085368692875, "reward_std": 0.26568231563433076, "rewards/accuracy_reward": 0.0375, "rewards/cosine_scaled_reward": -0.07389147193171083, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8999999985098839, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 132.05625, "epoch": 0.24, "grad_norm": 11.310062408447266, "kl": 5.49139404296875, "learning_rate": 4.789137758808823e-06, "loss": 0.432, "reward": 0.7667079947888851, "reward_std": 0.25788455196889115, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.1312086760997772, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8916666679084301, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 100.15, "epoch": 0.24533333333333332, "grad_norm": 7.714837074279785, "kl": 2.090936279296875, "learning_rate": 4.770016517582283e-06, "loss": 0.2939, "reward": 0.9156014438718557, "reward_std": 0.33299890445778146, "rewards/accuracy_reward": 0.05, "rewards/cosine_scaled_reward": -0.03231522748246789, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8979166761040688, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 116.16875, "epoch": 0.25066666666666665, "grad_norm": 29.340322494506836, "kl": 8.26297607421875, "learning_rate": 4.750107140553627e-06, "loss": 0.5621, "reward": 0.7611351676285267, "reward_std": 0.3555757596914191, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.11594817549921572, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8583333417773247, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 110.64375, "epoch": 0.256, "grad_norm": 14.472433090209961, "kl": 6.403857421875, "learning_rate": 4.7294165401363616e-06, "loss": 0.5554, "reward": 0.7904055327177048, "reward_std": 0.28852546858834105, "rewards/accuracy_reward": 0.0125, "rewards/cosine_scaled_reward": -0.1158444695873186, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8937500074505806, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 82.34375, "epoch": 0.2613333333333333, "grad_norm": 13.236370086669922, "kl": 6.80526123046875, "learning_rate": 4.712306397324877e-06, "loss": 0.5688, "reward": 0.8986638426780701, "reward_std": 0.2791194328689016, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.049252831703051926, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9229166701436042, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 96.94375, "epoch": 0.26666666666666666, "grad_norm": 17.91639518737793, "kl": 11.293544006347656, "learning_rate": 4.69022787828549e-06, "loss": 0.8827, "reward": 0.7485023282468319, "reward_std": 0.35306237193290146, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.10983100975863636, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8583333387970924, "step": 250 }, { "clip_ratio": 0.0, "completion_length": 77.75625, "epoch": 0.272, "grad_norm": 16.45020866394043, "kl": 5.82073974609375, "learning_rate": 4.66738892556983e-06, "loss": 0.5283, "reward": 0.8968394428491593, "reward_std": 0.24545104982098565, "rewards/accuracy_reward": 0.01875, "rewards/cosine_scaled_reward": -0.046910554519854486, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9250000029802322, "step": 255 }, { "clip_ratio": 0.0, "completion_length": 77.80625, "epoch": 0.2773333333333333, "grad_norm": 32.926937103271484, "kl": 9.4225341796875, "learning_rate": 4.643797468722099e-06, "loss": 0.8073, "reward": 0.8601903270930051, "reward_std": 0.18378369295678568, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0877263396163471, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9479166686534881, "step": 260 }, { "clip_ratio": 0.0, "completion_length": 81.775, "epoch": 0.2826666666666667, "grad_norm": 12.122540473937988, "kl": 9.3671630859375, "learning_rate": 4.6194616985513144e-06, "loss": 0.7643, "reward": 0.8126994274556637, "reward_std": 0.26541943780030125, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.09563390930416063, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9020833432674408, "step": 265 }, { "clip_ratio": 0.0, "completion_length": 65.30625, "epoch": 0.288, "grad_norm": 14.13431453704834, "kl": 7.41337890625, "learning_rate": 4.594390064287515e-06, "loss": 0.6638, "reward": 0.8327964015305043, "reward_std": 0.2086354006532929, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06512026621494442, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8979166716337204, "step": 270 }, { "clip_ratio": 0.0, "completion_length": 63.6625, "epoch": 0.29333333333333333, "grad_norm": 19.133230209350586, "kl": 8.81510009765625, "learning_rate": 4.568591270648233e-06, "loss": 0.7184, "reward": 0.8030373096466065, "reward_std": 0.26814147859986404, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06571269998094068, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8687500089406968, "step": 275 }, { "clip_ratio": 0.0, "completion_length": 65.8875, "epoch": 0.2986666666666667, "grad_norm": 20.222061157226562, "kl": 9.532666015625, "learning_rate": 4.5420742748162735e-06, "loss": 0.786, "reward": 0.8763291284441947, "reward_std": 0.17602166483411566, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06533753902185709, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9416666701436043, "step": 280 }, { "clip_ratio": 0.0, "completion_length": 89.26875, "epoch": 0.304, "grad_norm": 11.2631254196167, "kl": 11.497705078125, "learning_rate": 4.514848283329835e-06, "loss": 0.9101, "reward": 0.8311819508671761, "reward_std": 0.2350232223427156, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.10215137323830277, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9333333358168602, "step": 285 }, { "clip_ratio": 0.0, "completion_length": 78.51875, "epoch": 0.30933333333333335, "grad_norm": 22.054162979125977, "kl": 9.7431640625, "learning_rate": 4.486922748886054e-06, "loss": 0.8153, "reward": 0.8432315267622471, "reward_std": 0.226219168829266, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07551847096183337, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9187500044703484, "step": 290 }, { "clip_ratio": 0.0, "completion_length": 62.14375, "epoch": 0.31466666666666665, "grad_norm": 21.374130249023438, "kl": 10.1951416015625, "learning_rate": 4.458307367059092e-06, "loss": 0.9005, "reward": 0.8722259551286697, "reward_std": 0.19159141974596422, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05902404521766584, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9312499985098839, "step": 295 }, { "epoch": 0.32, "grad_norm": 24.809528350830078, "learning_rate": 4.4290120729338835e-06, "loss": 1.1358, "step": 300 }, { "epoch": 0.32, "eval_clip_ratio": 0.0, "eval_completion_length": 65.7632, "eval_kl": Infinity, "eval_loss": 1.0562268495559692, "eval_reward": 0.7877680493371532, "eval_reward_std": 0.28633810927099196, "eval_rewards/accuracy_reward": 0.0008, "eval_rewards/cosine_scaled_reward": -0.08163195236857573, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.8686000056952238, "eval_runtime": 31706.4113, "eval_samples_per_second": 0.158, "eval_steps_per_second": 0.039, "step": 300 }, { "clip_ratio": 0.0, "completion_length": 66.99375, "epoch": 0.3253333333333333, "grad_norm": 16.53313636779785, "kl": 13.25738525390625, "learning_rate": 4.399047037656741e-06, "loss": 1.0307, "reward": 0.7880359996110201, "reward_std": 0.296626356554043, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08279733196541202, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8708333365619183, "step": 305 }, { "clip_ratio": 0.0, "completion_length": 47.05625, "epoch": 0.33066666666666666, "grad_norm": 21.8547306060791, "kl": 9.9548583984375, "learning_rate": 4.368422664903997e-06, "loss": 0.8694, "reward": 0.882577420771122, "reward_std": 0.18155750810219615, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05075591259810608, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9333333373069763, "step": 310 }, { "clip_ratio": 0.0, "completion_length": 52.09375, "epoch": 0.336, "grad_norm": 32.71742248535156, "kl": 13.6111328125, "learning_rate": 4.3371495872699044e-06, "loss": 1.0447, "reward": 0.8353129029273987, "reward_std": 0.24553734397513835, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06052043429663172, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8958333328366279, "step": 315 }, { "clip_ratio": 0.0, "completion_length": 49.075, "epoch": 0.3413333333333333, "grad_norm": 17.508634567260742, "kl": 9.9728515625, "learning_rate": 4.305238662575073e-06, "loss": 0.8065, "reward": 0.8548699423670769, "reward_std": 0.19330476764516788, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.047213390382239595, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.902083334326744, "step": 320 }, { "clip_ratio": 0.0, "completion_length": 52.6125, "epoch": 0.3466666666666667, "grad_norm": 15.830941200256348, "kl": 12.16015625, "learning_rate": 4.272700970096696e-06, "loss": 1.0147, "reward": 0.7913525246083737, "reward_std": 0.26694968109495676, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.058647475835459775, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8500000044703484, "step": 325 }, { "clip_ratio": 0.0, "completion_length": 48.8875, "epoch": 0.352, "grad_norm": 14.494447708129883, "kl": 14.19609375, "learning_rate": 4.239547806721892e-06, "loss": 1.1633, "reward": 0.76911461353302, "reward_std": 0.318796195685718, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0558853830647422, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8250000014901161, "step": 330 }, { "clip_ratio": 0.0, "completion_length": 32.3625, "epoch": 0.35733333333333334, "grad_norm": 21.269908905029297, "kl": 11.915087890625, "learning_rate": 4.2057906830255006e-06, "loss": 0.9898, "reward": 0.8679051876068116, "reward_std": 0.2277104783368486, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03626146233000327, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9041666701436043, "step": 335 }, { "clip_ratio": 0.0, "completion_length": 18.35625, "epoch": 0.3626666666666667, "grad_norm": 1.002107858657837, "kl": 8.1767578125, "learning_rate": 4.1714413192736756e-06, "loss": 0.52, "reward": 0.9522343382239342, "reward_std": 0.09452941585068401, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.012348967575235292, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9645833373069763, "step": 340 }, { "clip_ratio": 0.0, "completion_length": 27.9375, "epoch": 0.368, "grad_norm": 16.14938735961914, "kl": 11.773193359375, "learning_rate": 4.1365116413546835e-06, "loss": 0.8212, "reward": 0.899377702921629, "reward_std": 0.16825042173995824, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03187227531598182, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9312500044703483, "step": 345 }, { "clip_ratio": 0.0, "completion_length": 52.64375, "epoch": 0.37333333333333335, "grad_norm": 12.938347816467285, "kl": 16.27109375, "learning_rate": 4.101013776638309e-06, "loss": 1.218, "reward": 0.8322627246379852, "reward_std": 0.24702412509959687, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06565392787888413, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8979166731238365, "step": 350 }, { "clip_ratio": 0.0, "completion_length": 48.01875, "epoch": 0.37866666666666665, "grad_norm": 23.911727905273438, "kl": 10.50517578125, "learning_rate": 4.064960049765304e-06, "loss": 1.0027, "reward": 0.8219170615077018, "reward_std": 0.28035222916250857, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04891626287571853, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8708333402872086, "step": 355 }, { "clip_ratio": 0.0, "completion_length": 30.28125, "epoch": 0.384, "grad_norm": 7.875117301940918, "kl": 11.235986328125, "learning_rate": 4.028362978368352e-06, "loss": 0.9222, "reward": 0.8919128350913524, "reward_std": 0.20153675045185082, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03517048052453901, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9270833373069763, "step": 360 }, { "clip_ratio": 0.0, "completion_length": 35.05625, "epoch": 0.3893333333333333, "grad_norm": 41.62312316894531, "kl": 15.83330078125, "learning_rate": 3.991235268726016e-06, "loss": 1.048, "reward": 0.8823194235563279, "reward_std": 0.17005571061081354, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04684721886005718, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.929166667163372, "step": 365 }, { "clip_ratio": 0.0, "completion_length": 42.98125, "epoch": 0.39466666666666667, "grad_norm": 6.0660810470581055, "kl": 16.304296875, "learning_rate": 3.9535898113512046e-06, "loss": 1.1516, "reward": 0.8508514143526554, "reward_std": 0.23275069000383156, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06789856371178757, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.91875, "step": 370 }, { "clip_ratio": 0.0, "completion_length": 50.45625, "epoch": 0.4, "grad_norm": 21.067644119262695, "kl": 12.8263671875, "learning_rate": 3.91543967651566e-06, "loss": 1.0909, "reward": 0.824290581792593, "reward_std": 0.2590523644972563, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07987606586975744, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.904166667163372, "step": 375 }, { "clip_ratio": 0.0, "completion_length": 56.78125, "epoch": 0.4053333333333333, "grad_norm": 36.3121452331543, "kl": 13.694140625, "learning_rate": 3.876798109712041e-06, "loss": 1.0768, "reward": 0.8105051450431346, "reward_std": 0.2675132915383074, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08532816520455526, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8958333358168602, "step": 380 }, { "clip_ratio": 0.0, "completion_length": 40.925, "epoch": 0.4106666666666667, "grad_norm": 13.308245658874512, "kl": 13.184912109375, "learning_rate": 3.837678527055168e-06, "loss": 1.184, "reward": 0.8861061662435532, "reward_std": 0.2109829214246929, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0493104824112379, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9354166686534882, "step": 385 }, { "clip_ratio": 0.0, "completion_length": 34.4875, "epoch": 0.416, "grad_norm": 10.46920108795166, "kl": 12.41220703125, "learning_rate": 3.798094510624037e-06, "loss": 0.9701, "reward": 0.8634556472301483, "reward_std": 0.2479419182986021, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.049044331473123745, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9125000029802323, "step": 390 }, { "clip_ratio": 0.0, "completion_length": 26.58125, "epoch": 0.42133333333333334, "grad_norm": 11.004674911499023, "kl": 12.82890625, "learning_rate": 3.7580598037461933e-06, "loss": 0.9156, "reward": 0.9137032449245452, "reward_std": 0.17160452669631923, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.032130066383979285, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9458333358168602, "step": 395 }, { "epoch": 0.4266666666666667, "grad_norm": 24.072229385375977, "learning_rate": 3.7257168766562506e-06, "loss": 0.7869, "step": 400 }, { "epoch": 0.4266666666666667, "eval_clip_ratio": 0.0, "eval_completion_length": 43.2119, "eval_kl": NaN, "eval_loss": NaN, "eval_reward": 0.8482989487310548, "eval_reward_std": 0.2360066335176363, "eval_rewards/accuracy_reward": 0.00035, "eval_rewards/cosine_scaled_reward": -0.05496769812278835, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9029166707515717, "eval_runtime": 25562.7568, "eval_samples_per_second": 0.196, "eval_steps_per_second": 0.049, "step": 400 }, { "clip_ratio": 0.0, "completion_length": 43.39375, "epoch": 0.432, "grad_norm": 7.45121431350708, "kl": 15.7309326171875, "learning_rate": 3.6849060565546753e-06, "loss": 1.3405, "reward": 0.8567041307687759, "reward_std": 0.20867629193626272, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.054754182432952804, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9114583350718022, "step": 405 }, { "clip_ratio": 0.0, "completion_length": 55.01875, "epoch": 0.43733333333333335, "grad_norm": 21.51198387145996, "kl": 12.4251953125, "learning_rate": 3.6436838443429177e-06, "loss": 1.0869, "reward": 0.7966017562896013, "reward_std": 0.28306140007152863, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06381489653722383, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8604166701436042, "step": 410 }, { "clip_ratio": 0.0, "completion_length": 45.6, "epoch": 0.44266666666666665, "grad_norm": 22.4779109954834, "kl": 10.958740234375, "learning_rate": 3.6020645521200474e-06, "loss": 1.0709, "reward": 0.9217145010828972, "reward_std": 0.1811069515156305, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.04286881822627038, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9583333358168602, "step": 415 }, { "clip_ratio": 0.0, "completion_length": 54.6875, "epoch": 0.448, "grad_norm": 17.699045181274414, "kl": 19.492236328125, "learning_rate": 3.560062629848876e-06, "loss": 1.4715, "reward": 0.8309016443789006, "reward_std": 0.27125840056105516, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.07951500885537825, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9104166716337204, "step": 420 }, { "clip_ratio": 0.0, "completion_length": 43.73125, "epoch": 0.4533333333333333, "grad_norm": 13.910821914672852, "kl": 13.74169921875, "learning_rate": 3.5176926603390176e-06, "loss": 1.0821, "reward": 0.9037241205573082, "reward_std": 0.1520853552130575, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.050442523750825786, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9541666731238365, "step": 425 }, { "clip_ratio": 0.0, "completion_length": 46.25625, "epoch": 0.45866666666666667, "grad_norm": 35.96714782714844, "kl": 15.023095703125, "learning_rate": 3.4749693541838305e-06, "loss": 1.2674, "reward": 0.8611304022371769, "reward_std": 0.2457447752461121, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05970291049015941, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9208333373069764, "step": 430 }, { "clip_ratio": 0.0, "completion_length": 25.18125, "epoch": 0.464, "grad_norm": 23.17320442199707, "kl": 14.875146484375, "learning_rate": 3.4405462708416393e-06, "loss": 0.9348, "reward": 0.9196932911872864, "reward_std": 0.15923782959812344, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.028223346812592354, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9479166686534881, "step": 435 }, { "clip_ratio": 0.0, "completion_length": 38.56875, "epoch": 0.4693333333333333, "grad_norm": 27.985633850097656, "kl": 13.66435546875, "learning_rate": 3.3972244177161966e-06, "loss": 1.0366, "reward": 0.8581299114972353, "reward_std": 0.2213375417979478, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.054370071421726604, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9125000044703484, "step": 440 }, { "clip_ratio": 0.0, "completion_length": 32.3125, "epoch": 0.4746666666666667, "grad_norm": 19.979711532592773, "kl": 12.21240234375, "learning_rate": 3.353591053779859e-06, "loss": 0.9727, "reward": 0.898980014026165, "reward_std": 0.1949043121188879, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03851995818695286, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9375000014901161, "step": 445 }, { "clip_ratio": 0.0, "completion_length": 28.8375, "epoch": 0.48, "grad_norm": 9.39575481414795, "kl": 12.267724609375, "learning_rate": 3.309661328268776e-06, "loss": 0.7964, "reward": 0.8783880487084389, "reward_std": 0.1970566307652007, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04244526417023735, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9208333373069764, "step": 450 }, { "clip_ratio": 0.0, "completion_length": 20.51875, "epoch": 0.48533333333333334, "grad_norm": 7.1221466064453125, "kl": 9.299951171875, "learning_rate": 3.2654504933140165e-06, "loss": 0.6537, "reward": 0.9451988354325295, "reward_std": 0.09344644367556612, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.013134470967634116, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9583333358168602, "step": 455 }, { "clip_ratio": 0.0, "completion_length": 37.9, "epoch": 0.49066666666666664, "grad_norm": 33.304874420166016, "kl": 12.762646484375, "learning_rate": 3.2209738986461186e-06, "loss": 0.9676, "reward": 0.8979712955653667, "reward_std": 0.17766471231188916, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.04161201652896125, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9333333358168602, "step": 460 }, { "clip_ratio": 0.0, "completion_length": 36.25, "epoch": 0.496, "grad_norm": 11.247632026672363, "kl": 12.395263671875, "learning_rate": 3.1762469862657673e-06, "loss": 0.9313, "reward": 0.8783193781971932, "reward_std": 0.21862645422424976, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.044597260178125, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9229166701436042, "step": 465 }, { "clip_ratio": 0.0, "completion_length": 31.64375, "epoch": 0.5013333333333333, "grad_norm": 20.89920997619629, "kl": 11.95849609375, "learning_rate": 3.1312852850824183e-06, "loss": 0.8608, "reward": 0.9111865252256394, "reward_std": 0.14750807457885456, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03673011756764026, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9479166671633721, "step": 470 }, { "clip_ratio": 0.0, "completion_length": 40.10625, "epoch": 0.5066666666666667, "grad_norm": 163.60240173339844, "kl": 13.748583984375, "learning_rate": 3.086104405522758e-06, "loss": 0.9573, "reward": 0.8916466869413853, "reward_std": 0.16286184734963172, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05001995721540879, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9416666686534881, "step": 475 }, { "clip_ratio": 0.0, "completion_length": 75.15625, "epoch": 0.512, "grad_norm": 11.370027542114258, "kl": 17.1302734375, "learning_rate": 3.0407200341108618e-06, "loss": 1.3223, "reward": 0.8070020548999309, "reward_std": 0.2748953197384253, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.10966460229246877, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9166666701436043, "step": 480 }, { "clip_ratio": 0.0, "completion_length": 63.71875, "epoch": 0.5173333333333333, "grad_norm": 11.755677223205566, "kl": 13.574951171875, "learning_rate": 2.995147928021925e-06, "loss": 1.2125, "reward": 0.8644190408289433, "reward_std": 0.19235301127191634, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.08766427135560662, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9520833358168602, "step": 485 }, { "clip_ratio": 0.0, "completion_length": 37.76875, "epoch": 0.5226666666666666, "grad_norm": 9.177205085754395, "kl": 10.011669921875, "learning_rate": 2.9494039096114724e-06, "loss": 0.8227, "reward": 0.894260024279356, "reward_std": 0.16061883713009592, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.045323285380436576, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9395833358168602, "step": 490 }, { "clip_ratio": 0.0, "completion_length": 36.7625, "epoch": 0.528, "grad_norm": 22.728939056396484, "kl": 13.146923828125, "learning_rate": 2.903503860921931e-06, "loss": 1.0093, "reward": 0.8698573663830758, "reward_std": 0.20619451993443363, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04680927444132976, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9166666716337204, "step": 495 }, { "epoch": 0.5333333333333333, "grad_norm": 13.398255348205566, "learning_rate": 2.8574637181684817e-06, "loss": 1.1674, "step": 500 }, { "epoch": 0.5333333333333333, "eval_clip_ratio": 0.0, "eval_completion_length": 28.33905, "eval_kl": 12.18606328125, "eval_loss": 0.8569625020027161, "eval_reward": 0.90754769334288, "eval_reward_std": 0.1603725745135857, "eval_rewards/accuracy_reward": 0.0002, "eval_rewards/cosine_scaled_reward": -0.03205228116786893, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9394000029951334, "eval_runtime": 17575.1409, "eval_samples_per_second": 0.284, "eval_steps_per_second": 0.071, "step": 500 }, { "clip_ratio": 0.0, "completion_length": 32.834375, "epoch": 0.5386666666666666, "grad_norm": 5.583011150360107, "kl": 12.7831787109375, "learning_rate": 2.8112994662061065e-06, "loss": 0.7071, "reward": 0.8887035015970468, "reward_std": 0.17844919376798316, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.041504804241412785, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9302083358168602, "step": 505 }, { "clip_ratio": 0.0, "completion_length": 29.7, "epoch": 0.544, "grad_norm": 21.54783821105957, "kl": 10.3671875, "learning_rate": 2.765027132979743e-06, "loss": 0.7544, "reward": 0.9107995986938476, "reward_std": 0.16975313210086823, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.030867045068589505, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.935416667163372, "step": 510 }, { "clip_ratio": 0.0, "completion_length": 29.76875, "epoch": 0.5493333333333333, "grad_norm": 16.331401824951172, "kl": 9.68544921875, "learning_rate": 2.718662783959478e-06, "loss": 0.7631, "reward": 0.9468030020594597, "reward_std": 0.0817515407301471, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02611364198673982, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.972916667163372, "step": 515 }, { "clip_ratio": 0.0, "completion_length": 32.125, "epoch": 0.5546666666666666, "grad_norm": 15.422439575195312, "kl": 13.121435546875, "learning_rate": 2.672222516562719e-06, "loss": 1.0083, "reward": 0.9191447854042053, "reward_std": 0.15190400344636146, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03502185242396081, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9541666716337204, "step": 520 }, { "clip_ratio": 0.0, "completion_length": 45.1625, "epoch": 0.56, "grad_norm": 9.857662200927734, "kl": 17.79384765625, "learning_rate": 2.6257224545652688e-06, "loss": 1.3274, "reward": 0.8831685408949852, "reward_std": 0.1921324184851983, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05433144455164438, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9375000044703483, "step": 525 }, { "clip_ratio": 0.0, "completion_length": 38.85, "epoch": 0.5653333333333334, "grad_norm": 16.353286743164062, "kl": 13.777734375, "learning_rate": 2.579178742503245e-06, "loss": 0.9957, "reward": 0.9024578690528869, "reward_std": 0.1723116828528873, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.041292104346212, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9437500029802323, "step": 530 }, { "clip_ratio": 0.0, "completion_length": 30.5625, "epoch": 0.5706666666666667, "grad_norm": 8.267068862915039, "kl": 10.66357421875, "learning_rate": 2.5326075400678037e-06, "loss": 0.8638, "reward": 0.9338416069746017, "reward_std": 0.11608097783646372, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.026575039059389384, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9604166716337204, "step": 535 }, { "clip_ratio": 0.0, "completion_length": 43.09375, "epoch": 0.576, "grad_norm": 12.732483863830566, "kl": 14.028271484375, "learning_rate": 2.4860250164945877e-06, "loss": 1.1509, "reward": 0.8834603920578956, "reward_std": 0.20504345865338108, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04778958541719476, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9312500074505806, "step": 540 }, { "clip_ratio": 0.0, "completion_length": 38.90625, "epoch": 0.5813333333333334, "grad_norm": 14.660567283630371, "kl": 15.737451171875, "learning_rate": 2.4394473449498705e-06, "loss": 1.177, "reward": 0.8580375552177429, "reward_std": 0.22333436018479916, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.050295749311044344, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9083333358168602, "step": 545 }, { "clip_ratio": 0.0, "completion_length": 34.6625, "epoch": 0.5866666666666667, "grad_norm": 8.702583312988281, "kl": 12.357177734375, "learning_rate": 2.392890696915329e-06, "loss": 0.9558, "reward": 0.8997138164937496, "reward_std": 0.17692614756524563, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04195282297878293, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9416666716337204, "step": 550 }, { "clip_ratio": 0.0, "completion_length": 49.15625, "epoch": 0.592, "grad_norm": 12.479802131652832, "kl": 16.8791015625, "learning_rate": 2.346371236573409e-06, "loss": 1.2617, "reward": 0.8275570668280124, "reward_std": 0.2324092355556786, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.059942916077852716, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8875000044703484, "step": 555 }, { "clip_ratio": 0.0, "completion_length": 33.6625, "epoch": 0.5973333333333334, "grad_norm": 7.198239803314209, "kl": 12.54541015625, "learning_rate": 2.2999051151952168e-06, "loss": 0.9839, "reward": 0.8674640908837319, "reward_std": 0.2356707454970092, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.042952546622836964, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9104166716337204, "step": 560 }, { "clip_ratio": 0.0, "completion_length": 29.525, "epoch": 0.6026666666666667, "grad_norm": 3.6896111965179443, "kl": 11.279638671875, "learning_rate": 2.2535084655328957e-06, "loss": 0.8347, "reward": 0.9166582852602005, "reward_std": 0.148460166777204, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03542502008058364, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9520833328366279, "step": 565 }, { "clip_ratio": 0.0, "completion_length": 23.55625, "epoch": 0.608, "grad_norm": 10.003271102905273, "kl": 10.087255859375, "learning_rate": 2.2071973962184385e-06, "loss": 0.7287, "reward": 0.928079903870821, "reward_std": 0.12746163122355939, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.019836739538004623, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9479166686534881, "step": 570 }, { "clip_ratio": 0.0, "completion_length": 33.8625, "epoch": 0.6133333333333333, "grad_norm": 8.542618751525879, "kl": 14.265576171875, "learning_rate": 2.1609879861708664e-06, "loss": 1.1148, "reward": 0.8895561441779136, "reward_std": 0.19691728233592584, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03752716149028856, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9270833387970925, "step": 575 }, { "clip_ratio": 0.0, "completion_length": 37.3125, "epoch": 0.6186666666666667, "grad_norm": 14.088128089904785, "kl": 15.01044921875, "learning_rate": 2.1148962790137258e-06, "loss": 1.1356, "reward": 0.8983195193111897, "reward_std": 0.17465938089881092, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.043347125269065145, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9416666686534881, "step": 580 }, { "clip_ratio": 0.0, "completion_length": 32.19375, "epoch": 0.624, "grad_norm": 7.33213472366333, "kl": 12.38037109375, "learning_rate": 2.068938277504842e-06, "loss": 0.89, "reward": 0.9119960308074951, "reward_std": 0.1503613638204115, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.033837280941952486, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9458333358168602, "step": 585 }, { "clip_ratio": 0.0, "completion_length": 33.31875, "epoch": 0.6293333333333333, "grad_norm": 5.882805347442627, "kl": 11.83037109375, "learning_rate": 2.02312993798026e-06, "loss": 0.919, "reward": 0.8715338334441185, "reward_std": 0.21188033148646354, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0367994706160971, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9083333402872086, "step": 590 }, { "clip_ratio": 0.0, "completion_length": 26.5625, "epoch": 0.6346666666666667, "grad_norm": 4.920403003692627, "kl": 10.397021484375, "learning_rate": 1.9774871648143033e-06, "loss": 0.8162, "reward": 0.9242352560162544, "reward_std": 0.14808180312784316, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02368138517922489, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9479166716337204, "step": 595 }, { "epoch": 0.64, "grad_norm": 8.031571388244629, "learning_rate": 1.93202580489767e-06, "loss": 0.9767, "step": 600 }, { "epoch": 0.64, "eval_clip_ratio": 0.0, "eval_completion_length": 24.49845, "eval_kl": 11.000201171875, "eval_loss": 0.762321412563324, "eval_reward": 0.9325542439005221, "eval_reward_std": 0.11938674081818317, "eval_rewards/accuracy_reward": 0.0003, "eval_rewards/cosine_scaled_reward": -0.021479063632985344, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9537333353444933, "eval_runtime": 14835.8257, "eval_samples_per_second": 0.337, "eval_steps_per_second": 0.084, "step": 600 }, { "clip_ratio": 0.0, "completion_length": 25.103125, "epoch": 0.6453333333333333, "grad_norm": 2.8671510219573975, "kl": 11.370947265625, "learning_rate": 1.886761642135495e-06, "loss": 0.5784, "reward": 0.9217620514333248, "reward_std": 0.1400632432058046, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0230295914618182, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9447916686534882, "step": 605 }, { "clip_ratio": 0.0, "completion_length": 27.1625, "epoch": 0.6506666666666666, "grad_norm": 10.607571601867676, "kl": 12.068896484375, "learning_rate": 1.8417103919672686e-06, "loss": 0.8698, "reward": 0.9195777177810669, "reward_std": 0.1432917347177863, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02625558597937925, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9458333343267441, "step": 610 }, { "clip_ratio": 0.0, "completion_length": 26.18125, "epoch": 0.656, "grad_norm": 5.264357089996338, "kl": 10.251806640625, "learning_rate": 1.7968876959105353e-06, "loss": 0.7371, "reward": 0.932566262036562, "reward_std": 0.1156949118234479, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.025767041655490174, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9583333328366279, "step": 615 }, { "clip_ratio": 0.0, "completion_length": 24.19375, "epoch": 0.6613333333333333, "grad_norm": 9.326305389404297, "kl": 11.10224609375, "learning_rate": 1.7523091161302552e-06, "loss": 0.82, "reward": 0.9506021127104759, "reward_std": 0.08566303365714703, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01606452676060144, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9666666686534882, "step": 620 }, { "clip_ratio": 0.0, "completion_length": 28.30625, "epoch": 0.6666666666666666, "grad_norm": 6.24821138381958, "kl": 12.0203125, "learning_rate": 1.707990130035717e-06, "loss": 0.9376, "reward": 0.9201748922467232, "reward_std": 0.14491571187973024, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.029825082910247148, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9500000029802322, "step": 625 }, { "clip_ratio": 0.0, "completion_length": 38.89375, "epoch": 0.672, "grad_norm": 14.636734962463379, "kl": 17.04765625, "learning_rate": 1.6639461249068727e-06, "loss": 1.2657, "reward": 0.878020665794611, "reward_std": 0.22177106849794653, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.049062640547344924, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9270833387970925, "step": 630 }, { "clip_ratio": 0.0, "completion_length": 41.09375, "epoch": 0.6773333333333333, "grad_norm": 14.611737251281738, "kl": 15.438720703125, "learning_rate": 1.6201923925519742e-06, "loss": 1.1666, "reward": 0.8792784817516803, "reward_std": 0.18853381305234507, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04988816333207069, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9291666701436043, "step": 635 }, { "clip_ratio": 0.0, "completion_length": 31.025, "epoch": 0.6826666666666666, "grad_norm": 8.413250923156738, "kl": 9.849609375, "learning_rate": 1.5767441239983433e-06, "loss": 0.7778, "reward": 0.895102259516716, "reward_std": 0.17950981706380845, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.027814385169767773, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9229166671633721, "step": 640 }, { "clip_ratio": 0.0, "completion_length": 29.7, "epoch": 0.688, "grad_norm": 8.036944389343262, "kl": 10.983837890625, "learning_rate": 1.5336164042181495e-06, "loss": 0.847, "reward": 0.9059751465916633, "reward_std": 0.15557526089032764, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03152483354060678, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9375000029802323, "step": 645 }, { "clip_ratio": 0.0, "completion_length": 26.625, "epoch": 0.6933333333333334, "grad_norm": 11.110798835754395, "kl": 11.3498046875, "learning_rate": 1.4908242068909922e-06, "loss": 0.8364, "reward": 0.9294438496232033, "reward_std": 0.12713208887726068, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.024722788939834574, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9541666701436042, "step": 650 }, { "clip_ratio": 0.0, "completion_length": 36.7, "epoch": 0.6986666666666667, "grad_norm": 6.07194185256958, "kl": 14.727978515625, "learning_rate": 1.4483823892051346e-06, "loss": 1.1029, "reward": 0.8648552462458611, "reward_std": 0.22041521333158015, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04347805892175529, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9083333358168602, "step": 655 }, { "clip_ratio": 0.0, "completion_length": 28.825, "epoch": 0.704, "grad_norm": 19.0600528717041, "kl": 12.726025390625, "learning_rate": 1.4063056866991826e-06, "loss": 0.9563, "reward": 0.8969103991985321, "reward_std": 0.18587914234958589, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.032256243082520085, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9291666716337204, "step": 660 }, { "clip_ratio": 0.0, "completion_length": 29.3625, "epoch": 0.7093333333333334, "grad_norm": 8.13565731048584, "kl": 11.763818359375, "learning_rate": 1.3646087081459875e-06, "loss": 0.8517, "reward": 0.9051434069871902, "reward_std": 0.16978331273421646, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.028189902065787465, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.933333334326744, "step": 665 }, { "clip_ratio": 0.0, "completion_length": 33.73125, "epoch": 0.7146666666666667, "grad_norm": 7.444943428039551, "kl": 13.7029296875, "learning_rate": 1.3233059304805798e-06, "loss": 1.0022, "reward": 0.8938254207372666, "reward_std": 0.19537937436252834, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03950788572692545, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.933333334326744, "step": 670 }, { "clip_ratio": 0.0, "completion_length": 40.3125, "epoch": 0.72, "grad_norm": 8.320772171020508, "kl": 14.048828125, "learning_rate": 1.282411693773858e-06, "loss": 1.1004, "reward": 0.8735457874834538, "reward_std": 0.21316041266545654, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04103752294467995, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9145833373069763, "step": 675 }, { "clip_ratio": 0.0, "completion_length": 30.80625, "epoch": 0.7253333333333334, "grad_norm": 9.357903480529785, "kl": 10.77841796875, "learning_rate": 1.2419401962538075e-06, "loss": 0.8574, "reward": 0.9765814572572709, "reward_std": 0.13411221810274582, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.0067518523617764005, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9583333343267441, "step": 680 }, { "clip_ratio": 0.0, "completion_length": 30.9875, "epoch": 0.7306666666666667, "grad_norm": 7.937640190124512, "kl": 10.77626953125, "learning_rate": 1.2019054893759632e-06, "loss": 0.8406, "reward": 0.9284723967313766, "reward_std": 0.12998418211936952, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03611091619386571, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.964583334326744, "step": 685 }, { "clip_ratio": 0.0, "completion_length": 29.71875, "epoch": 0.736, "grad_norm": 6.991804599761963, "kl": 11.34892578125, "learning_rate": 1.1623214729448318e-06, "loss": 0.8146, "reward": 0.9095292709767818, "reward_std": 0.14653535146389912, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.032137370252166876, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9416666701436043, "step": 690 }, { "clip_ratio": 0.0, "completion_length": 33.41875, "epoch": 0.7413333333333333, "grad_norm": 15.745359420776367, "kl": 13.367529296875, "learning_rate": 1.1232018902879603e-06, "loss": 0.9653, "reward": 0.909796753525734, "reward_std": 0.15389935614075512, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0402032266400056, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.95, "step": 695 }, { "epoch": 0.7466666666666667, "grad_norm": 6.5044684410095215, "learning_rate": 1.0845603234843406e-06, "loss": 1.1744, "step": 700 }, { "epoch": 0.7466666666666667, "eval_clip_ratio": 0.0, "eval_completion_length": 39.7183, "eval_kl": 14.359070703125, "eval_loss": 1.0987683534622192, "eval_reward": 0.849645221591182, "eval_reward_std": 0.2433718167852072, "eval_rewards/accuracy_reward": 0.00065, "eval_rewards/cosine_scaled_reward": -0.047671423346560916, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.8966666696608067, "eval_runtime": 24227.8268, "eval_samples_per_second": 0.206, "eval_steps_per_second": 0.052, "step": 700 }, { "clip_ratio": 0.0, "completion_length": 38.634375, "epoch": 0.752, "grad_norm": 6.759803295135498, "kl": 13.7447998046875, "learning_rate": 1.0464101886487958e-06, "loss": 0.9101, "reward": 0.8515742581337691, "reward_std": 0.2163259312044829, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.049467386461037675, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9010416690260172, "step": 705 }, { "clip_ratio": 0.0, "completion_length": 42.8375, "epoch": 0.7573333333333333, "grad_norm": 6.217792510986328, "kl": 13.227294921875, "learning_rate": 1.008764731273985e-06, "loss": 1.0981, "reward": 0.8388586275279521, "reward_std": 0.24302853061817586, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05072467893041903, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8895833402872085, "step": 710 }, { "clip_ratio": 0.0, "completion_length": 37.425, "epoch": 0.7626666666666667, "grad_norm": 4.551347255706787, "kl": 12.595361328125, "learning_rate": 9.716370216316484e-07, "loss": 0.9808, "reward": 0.8465159472078085, "reward_std": 0.23796997629106045, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.047234023729106414, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8937499970197678, "step": 715 }, { "clip_ratio": 0.0, "completion_length": 32.04375, "epoch": 0.768, "grad_norm": 5.863572120666504, "kl": 11.475537109375, "learning_rate": 9.35039950234696e-07, "loss": 0.8703, "reward": 0.9083453208208084, "reward_std": 0.16151572642847895, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.031237985155894422, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9395833358168602, "step": 720 }, { "clip_ratio": 0.0, "completion_length": 35.75625, "epoch": 0.7733333333333333, "grad_norm": 9.391020774841309, "kl": 12.37578125, "learning_rate": 8.98986223361692e-07, "loss": 0.9519, "reward": 0.8641272462904453, "reward_std": 0.23181609474122525, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.037956064224999864, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8958333358168602, "step": 725 }, { "clip_ratio": 0.0, "completion_length": 44.46875, "epoch": 0.7786666666666666, "grad_norm": 8.833009719848633, "kl": 15.742041015625, "learning_rate": 8.634883586453178e-07, "loss": 1.1814, "reward": 0.8054626323282719, "reward_std": 0.2871685145733863, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.06120401412335923, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8666666701436043, "step": 730 }, { "clip_ratio": 0.0, "completion_length": 44.24375, "epoch": 0.784, "grad_norm": 7.315384864807129, "kl": 15.74951171875, "learning_rate": 8.285586807263255e-07, "loss": 1.2603, "reward": 0.8248628986999392, "reward_std": 0.2860633011907339, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.054303745714423715, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8791666693985463, "step": 735 }, { "clip_ratio": 0.0, "completion_length": 37.0375, "epoch": 0.7893333333333333, "grad_norm": 6.141713619232178, "kl": 13.780419921875, "learning_rate": 7.942093169745005e-07, "loss": 1.0372, "reward": 0.8739085428416729, "reward_std": 0.2165603557601571, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04692476779600838, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9208333387970924, "step": 740 }, { "clip_ratio": 0.0, "completion_length": 39.03125, "epoch": 0.7946666666666666, "grad_norm": 8.604565620422363, "kl": 13.617626953125, "learning_rate": 7.604521932781081e-07, "loss": 1.0319, "reward": 0.8153880290687084, "reward_std": 0.2841905845445581, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.045028608468419405, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8604166716337204, "step": 745 }, { "clip_ratio": 0.0, "completion_length": 33.51875, "epoch": 0.8, "grad_norm": 6.989229679107666, "kl": 13.618359375, "learning_rate": 7.272990299033045e-07, "loss": 1.0911, "reward": 0.8738323897123337, "reward_std": 0.2549275178424068, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.036584255800698885, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9041666686534882, "step": 750 }, { "clip_ratio": 0.0, "completion_length": 18.93125, "epoch": 0.8053333333333333, "grad_norm": 3.7775022983551025, "kl": 7.380615234375, "learning_rate": 6.94761337424927e-07, "loss": 0.5148, "reward": 0.9608460694551468, "reward_std": 0.07732116826809943, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.01207056987186661, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.972916667163372, "step": 755 }, { "clip_ratio": 0.0, "completion_length": 35.05, "epoch": 0.8106666666666666, "grad_norm": 6.422189712524414, "kl": 12.621875, "learning_rate": 6.628504127300961e-07, "loss": 0.9193, "reward": 0.9023310661315918, "reward_std": 0.17520240979865775, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03933557301206747, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9416666731238366, "step": 760 }, { "clip_ratio": 0.0, "completion_length": 30.13125, "epoch": 0.816, "grad_norm": 8.22676944732666, "kl": 13.900146484375, "learning_rate": 6.315773350960036e-07, "loss": 1.0414, "reward": 0.8918016396462918, "reward_std": 0.19220082159044977, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03319833040877711, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9250000014901161, "step": 765 }, { "clip_ratio": 0.0, "completion_length": 37.56875, "epoch": 0.8213333333333334, "grad_norm": 10.879252433776855, "kl": 12.575537109375, "learning_rate": 6.009529623432591e-07, "loss": 0.8356, "reward": 0.9034772761166096, "reward_std": 0.19452963769435883, "rewards/accuracy_reward": 0.025, "rewards/cosine_scaled_reward": -0.013189362817502114, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8916666686534882, "step": 770 }, { "clip_ratio": 0.0, "completion_length": 28.9125, "epoch": 0.8266666666666667, "grad_norm": 4.9291253089904785, "kl": 11.142626953125, "learning_rate": 5.70987927066117e-07, "loss": 0.8191, "reward": 0.8983419455587864, "reward_std": 0.16392848258838058, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.026658029100508428, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9250000014901161, "step": 775 }, { "clip_ratio": 0.0, "completion_length": 30.6, "epoch": 0.832, "grad_norm": 6.1532745361328125, "kl": 11.01689453125, "learning_rate": 5.416926329409083e-07, "loss": 0.7916, "reward": 0.9111720651388169, "reward_std": 0.1525883299400448, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03257790798379574, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9437500029802323, "step": 780 }, { "clip_ratio": 0.0, "completion_length": 36.1, "epoch": 0.8373333333333334, "grad_norm": 10.675848960876465, "kl": 14.37998046875, "learning_rate": 5.130772511139456e-07, "loss": 1.1733, "reward": 0.8728187620639801, "reward_std": 0.2474873424973339, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03968120885547251, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9124999985098838, "step": 785 }, { "clip_ratio": 0.0, "completion_length": 37.5, "epoch": 0.8426666666666667, "grad_norm": 6.78824520111084, "kl": 13.61171875, "learning_rate": 4.851517166701658e-07, "loss": 1.0231, "reward": 0.868859538435936, "reward_std": 0.22290587332099676, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04364043357345508, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9125000014901161, "step": 790 }, { "clip_ratio": 0.0, "completion_length": 43.45625, "epoch": 0.848, "grad_norm": 11.661620140075684, "kl": 13.742529296875, "learning_rate": 4.5792572518372714e-07, "loss": 1.0371, "reward": 0.8495261050760746, "reward_std": 0.256103105548209, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.04422387464583153, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8875000014901161, "step": 795 }, { "epoch": 0.8533333333333334, "grad_norm": 11.056589126586914, "learning_rate": 4.3140872935176714e-07, "loss": 0.9557, "step": 800 }, { "epoch": 0.8533333333333334, "eval_clip_ratio": 0.0, "eval_completion_length": 33.62895, "eval_kl": 12.172851171875, "eval_loss": 0.9448180794715881, "eval_reward": 0.884240204228802, "eval_reward_std": 0.19757600214657473, "eval_rewards/accuracy_reward": 0.00055, "eval_rewards/cosine_scaled_reward": -0.03577643865282516, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.9194666695296765, "eval_runtime": 22547.6102, "eval_samples_per_second": 0.222, "eval_steps_per_second": 0.055, "step": 800 }, { "clip_ratio": 0.0, "completion_length": 35.159375, "epoch": 0.8586666666666667, "grad_norm": 14.952364921569824, "kl": 13.27841796875, "learning_rate": 4.0560993571248485e-07, "loss": 1.0932, "reward": 0.8883439194411039, "reward_std": 0.19072621676800736, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.039781055343337354, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9281250014901161, "step": 805 }, { "clip_ratio": 0.0, "completion_length": 35.05, "epoch": 0.864, "grad_norm": 10.145599365234375, "kl": 12.07646484375, "learning_rate": 3.805383014486855e-07, "loss": 0.9373, "reward": 0.8534569308161736, "reward_std": 0.24032519459724427, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.036126373808656354, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8895833373069764, "step": 810 }, { "clip_ratio": 0.0, "completion_length": 23.8625, "epoch": 0.8693333333333333, "grad_norm": 5.662459373474121, "kl": 8.621875, "learning_rate": 3.5620253127790187e-07, "loss": 0.5794, "reward": 0.9346766419708729, "reward_std": 0.09575240558187943, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.02157333122449927, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9562500014901161, "step": 815 }, { "clip_ratio": 0.0, "completion_length": 44.1875, "epoch": 0.8746666666666667, "grad_norm": 5.7309980392456055, "kl": 15.955322265625, "learning_rate": 3.3261107443017054e-07, "loss": 1.2488, "reward": 0.8095948047935962, "reward_std": 0.3085315997945145, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05915517628745874, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8687500044703483, "step": 820 }, { "clip_ratio": 0.0, "completion_length": 34.75625, "epoch": 0.88, "grad_norm": 7.436826229095459, "kl": 12.489697265625, "learning_rate": 3.0977212171451e-07, "loss": 0.9417, "reward": 0.8895582810044289, "reward_std": 0.1721190543845296, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03752502345741959, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9270833358168602, "step": 825 }, { "clip_ratio": 0.0, "completion_length": 31.575, "epoch": 0.8853333333333333, "grad_norm": 5.596876621246338, "kl": 12.599609375, "learning_rate": 2.876936026751234e-07, "loss": 0.9465, "reward": 0.8497903808951378, "reward_std": 0.23422730285674334, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.039792930785915816, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8895833387970924, "step": 830 }, { "clip_ratio": 0.0, "completion_length": 37.8125, "epoch": 0.8906666666666667, "grad_norm": 9.07991886138916, "kl": 15.519482421875, "learning_rate": 2.663831828383098e-07, "loss": 1.2503, "reward": 0.8558540269732475, "reward_std": 0.24497014822754865, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04414595882117282, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9000000014901162, "step": 835 }, { "clip_ratio": 0.0, "completion_length": 31.5625, "epoch": 0.896, "grad_norm": 10.441224098205566, "kl": 12.43076171875, "learning_rate": 2.4584826105103764e-07, "loss": 0.9553, "reward": 0.8752666190266609, "reward_std": 0.2350561751052737, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.035150024328322614, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9104166686534881, "step": 840 }, { "clip_ratio": 0.0, "completion_length": 40.03125, "epoch": 0.9013333333333333, "grad_norm": 15.380375862121582, "kl": 15.1236328125, "learning_rate": 2.2609596691211406e-07, "loss": 1.1801, "reward": 0.8480764515697956, "reward_std": 0.251245317235589, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04984018635150278, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8979166716337204, "step": 845 }, { "clip_ratio": 0.0, "completion_length": 38.30625, "epoch": 0.9066666666666666, "grad_norm": 24.35810661315918, "kl": 14.10478515625, "learning_rate": 2.071331582968289e-07, "loss": 1.0496, "reward": 0.8491456843912601, "reward_std": 0.23082056756447855, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.044604293110023716, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8937500014901161, "step": 850 }, { "clip_ratio": 0.0, "completion_length": 31.91875, "epoch": 0.912, "grad_norm": 11.125753402709961, "kl": 11.45751953125, "learning_rate": 1.889664189759449e-07, "loss": 0.8088, "reward": 0.902073758840561, "reward_std": 0.15496895949763712, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03542621683154721, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9375, "step": 855 }, { "clip_ratio": 0.0, "completion_length": 32.85625, "epoch": 0.9173333333333333, "grad_norm": 10.627705574035645, "kl": 13.045654296875, "learning_rate": 1.7160205632985067e-07, "loss": 0.9879, "reward": 0.8759149216115475, "reward_std": 0.2140658195130527, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03866838661197107, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9145833358168602, "step": 860 }, { "clip_ratio": 0.0, "completion_length": 34.75, "epoch": 0.9226666666666666, "grad_norm": 6.658540725708008, "kl": 13.058203125, "learning_rate": 1.550460991586794e-07, "loss": 1.0528, "reward": 0.8701386474072933, "reward_std": 0.21246134424582125, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03819466594577534, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.908333332836628, "step": 865 }, { "clip_ratio": 0.0, "completion_length": 37.7875, "epoch": 0.928, "grad_norm": 9.237292289733887, "kl": 13.703759765625, "learning_rate": 1.3930429558914492e-07, "loss": 1.0694, "reward": 0.8584538690745831, "reward_std": 0.22779980981722475, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.0415461105396389, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9000000059604645, "step": 870 }, { "clip_ratio": 0.0, "completion_length": 33.2, "epoch": 0.9333333333333333, "grad_norm": 9.896727561950684, "kl": 12.895263671875, "learning_rate": 1.2438211107882654e-07, "loss": 1.0122, "reward": 0.8824942708015442, "reward_std": 0.2223971493065619, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03833904006460216, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9208333358168602, "step": 875 }, { "clip_ratio": 0.0, "completion_length": 35.3625, "epoch": 0.9386666666666666, "grad_norm": 13.891570091247559, "kl": 14.306591796875, "learning_rate": 1.1028472651859829e-07, "loss": 1.076, "reward": 0.8697241485118866, "reward_std": 0.21391889560036362, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03860916146513773, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9083333343267441, "step": 880 }, { "clip_ratio": 0.0, "completion_length": 42.85625, "epoch": 0.944, "grad_norm": 8.301593780517578, "kl": 14.454296875, "learning_rate": 9.701703643385296e-08, "loss": 1.0311, "reward": 0.8556662514805794, "reward_std": 0.21970865479670465, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.05266706076072296, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9083333313465118, "step": 885 }, { "clip_ratio": 0.0, "completion_length": 34.60625, "epoch": 0.9493333333333334, "grad_norm": 8.000605583190918, "kl": 12.25859375, "learning_rate": 8.45836472851544e-08, "loss": 0.8527, "reward": 0.8944506429135799, "reward_std": 0.20130182611646888, "rewards/accuracy_reward": 0.00625, "rewards/cosine_scaled_reward": -0.034716006646340246, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9229166701436042, "step": 890 }, { "clip_ratio": 0.0, "completion_length": 36.0375, "epoch": 0.9546666666666667, "grad_norm": 8.075848579406738, "kl": 13.883349609375, "learning_rate": 7.298887586890207e-08, "loss": 1.0478, "reward": 0.8447876520454883, "reward_std": 0.24192846552468836, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.044795656835776756, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8895833335816861, "step": 895 }, { "epoch": 0.96, "grad_norm": 14.867563247680664, "learning_rate": 6.223674781856593e-08, "loss": 1.0318, "step": 900 }, { "epoch": 0.96, "eval_clip_ratio": 0.0, "eval_completion_length": 36.44145, "eval_kl": 13.474596875, "eval_loss": 1.031240463256836, "eval_reward": 0.8674011821113526, "eval_reward_std": 0.22110519010493734, "eval_rewards/accuracy_reward": 0.00065, "eval_rewards/cosine_scaled_reward": -0.04208212785659125, "eval_rewards/format_reward": 0.0, "eval_rewards/reasoning_steps_reward": 0.908833336481452, "eval_runtime": 22250.8735, "eval_samples_per_second": 0.225, "eval_steps_per_second": 0.056, "step": 900 }, { "clip_ratio": 0.0, "completion_length": 31.6375, "epoch": 0.9653333333333334, "grad_norm": 10.456296920776367, "kl": 12.725634765625, "learning_rate": 5.2330996207010934e-08, "loss": 0.9935, "reward": 0.8905447907745838, "reward_std": 0.19639883563213517, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03237185183097609, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9229166693985462, "step": 905 }, { "clip_ratio": 0.0, "completion_length": 39.18125, "epoch": 0.9706666666666667, "grad_norm": 7.654210567474365, "kl": 14.7568359375, "learning_rate": 4.327506025039785e-08, "loss": 1.2517, "reward": 0.8837279558181763, "reward_std": 0.21061566043645144, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.04335535656136926, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9270833373069763, "step": 910 }, { "clip_ratio": 0.0, "completion_length": 39.51875, "epoch": 0.976, "grad_norm": 11.826396942138672, "kl": 15.272412109375, "learning_rate": 3.5072084114107784e-08, "loss": 1.2212, "reward": 0.854040639102459, "reward_std": 0.25280070770531893, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.048042675899341705, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9020833387970925, "step": 915 }, { "clip_ratio": 0.0, "completion_length": 35.90625, "epoch": 0.9813333333333333, "grad_norm": 9.126459121704102, "kl": 12.9994140625, "learning_rate": 2.772491582110709e-08, "loss": 1.0326, "reward": 0.8983750879764557, "reward_std": 0.17179049158003182, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.043291549726563974, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9416666701436043, "step": 920 }, { "clip_ratio": 0.0, "completion_length": 34.6375, "epoch": 0.9866666666666667, "grad_norm": 7.235086441040039, "kl": 13.3751953125, "learning_rate": 2.1236106263132495e-08, "loss": 1.0079, "reward": 0.8850020661950111, "reward_std": 0.20363401472568513, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03791458437335678, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9229166716337204, "step": 925 }, { "clip_ratio": 0.0, "completion_length": 33.6125, "epoch": 0.992, "grad_norm": 6.8874053955078125, "kl": 12.7154296875, "learning_rate": 1.560790831503567e-08, "loss": 0.9974, "reward": 0.8779860392212868, "reward_std": 0.21407040767371655, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.03243060409004102, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9104166701436043, "step": 930 }, { "clip_ratio": 0.0, "completion_length": 27.25625, "epoch": 0.9973333333333333, "grad_norm": 7.043887138366699, "kl": 11.3630859375, "learning_rate": 1.0842276052599743e-08, "loss": 0.8594, "reward": 0.907631978392601, "reward_std": 0.1686173222726211, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.025701325823320076, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.9333333373069763, "step": 935 }, { "clip_ratio": 0.0, "completion_length": 37.796875, "epoch": 0.9994666666666666, "kl": 14.3427734375, "reward": 0.8153905930766996, "reward_std": 0.2130387331271777, "rewards/accuracy_reward": 0.0, "rewards/cosine_scaled_reward": -0.038776053593210236, "rewards/format_reward": 0.0, "rewards/reasoning_steps_reward": 0.8541666679084301, "step": 937, "total_flos": 0.0, "train_loss": 0.7647797629284884, "train_runtime": 290057.3567, "train_samples_per_second": 0.026, "train_steps_per_second": 0.003 } ], "logging_steps": 5, "max_steps": 937, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }