diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3131 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9994666666666666, + "eval_steps": 100, + "global_step": 937, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completion_length": 195.34375, + "epoch": 0.0010666666666666667, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 0.0, + "loss": -0.0044, + "reward": -0.05581664200872183, + "reward_std": 0.22136071452405304, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.2953999750316143, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2395833432674408, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.265625, + "epoch": 0.005333333333333333, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.034, + "reward": -0.029378876788541675, + "reward_std": 0.2783559260133188, + "rewards/accuracy_reward": 0.015625, + "rewards/cosine_scaled_reward": -0.27677471633069217, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.23177084093913436, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.75625, + "epoch": 0.010666666666666666, + "grad_norm": 0.0, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.0113, + "reward": 0.03780887741595507, + "reward_std": 0.32640064391307533, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.299691129103303, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.3250000080093741, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completion_length": 187.6375, + "epoch": 0.016, + "grad_norm": 2.727306365966797, + "kl": 0.0, + "learning_rate": 5.319148936170213e-08, + "loss": 0.0319, + "reward": 0.02281488720327616, + "reward_std": 0.3450261281337589, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": -0.2584351147990674, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2437500048428774, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completion_length": 195.69375, + "epoch": 0.021333333333333333, + "grad_norm": 2.623044013977051, + "kl": -7.709860801696777e-06, + "learning_rate": 3.1914893617021275e-07, + "loss": -0.0101, + "reward": -0.04130282774567604, + "reward_std": 0.2893871849635616, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.29546949565410613, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.24166667126119137, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completion_length": 191.08125, + "epoch": 0.02666666666666667, + "grad_norm": 2.724001884460449, + "kl": 3.969669342041015e-06, + "learning_rate": 5.851063829787235e-07, + "loss": 0.0142, + "reward": 0.07371731325984002, + "reward_std": 0.3645049626007676, + "rewards/accuracy_reward": 0.05625, + "rewards/cosine_scaled_reward": -0.2554493617266417, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.27291667386889457, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.31875, + "epoch": 0.032, + "grad_norm": 2.69883131980896, + "kl": 0.00025610625743865967, + "learning_rate": 8.510638297872341e-07, + "loss": 0.0041, + "reward": -0.03928825343027711, + "reward_std": 0.31049659312702715, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.3247049249708652, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.27916667200624945, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.0125, + "epoch": 0.037333333333333336, + "grad_norm": 2.714967727661133, + "kl": 0.0017081737518310548, + "learning_rate": 1.1170212765957447e-06, + "loss": -0.0011, + "reward": -0.013306560833007098, + "reward_std": 0.2835619566962123, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.3153898956254125, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.2770833408460021, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.4875, + "epoch": 0.042666666666666665, + "grad_norm": 2.374499559402466, + "kl": 0.0033346176147460937, + "learning_rate": 1.3829787234042555e-06, + "loss": 0.0038, + "reward": 0.08598366118967533, + "reward_std": 0.29695698702707884, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.3035996824502945, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.3770833430811763, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completion_length": 198.94375, + "epoch": 0.048, + "grad_norm": 2.4198946952819824, + "kl": 0.007924556732177734, + "learning_rate": 1.648936170212766e-06, + "loss": 0.0057, + "reward": 0.1478586002252996, + "reward_std": 0.251297368388623, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.31672474220395086, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.46458334363996984, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.36875, + "epoch": 0.05333333333333334, + "grad_norm": 2.5116357803344727, + "kl": 0.01243419647216797, + "learning_rate": 1.9148936170212767e-06, + "loss": 0.0035, + "reward": 0.2862342089414597, + "reward_std": 0.2713195723015815, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.29501580335199834, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.5750000137835741, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completion_length": 197.58125, + "epoch": 0.058666666666666666, + "grad_norm": 2.543461561203003, + "kl": 0.020899581909179687, + "learning_rate": 2.1808510638297876e-06, + "loss": -0.0066, + "reward": 0.43956980630755427, + "reward_std": 0.2677960195578635, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.2791802009567618, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.6875000156462192, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completion_length": 199.325, + "epoch": 0.064, + "grad_norm": 2.8129403591156006, + "kl": 0.042889404296875, + "learning_rate": 2.446808510638298e-06, + "loss": -0.004, + "reward": 0.4680780492722988, + "reward_std": 0.18648194698616863, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.33192195519804957, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8000000169500708, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.63125, + "epoch": 0.06933333333333333, + "grad_norm": 2.7215657234191895, + "kl": 0.045915985107421876, + "learning_rate": 2.7127659574468084e-06, + "loss": 0.0162, + "reward": 0.6141613692045211, + "reward_std": 0.23586739597376435, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.269171973131597, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8583333477377891, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completion_length": 194.975, + "epoch": 0.07466666666666667, + "grad_norm": 2.7333619594573975, + "kl": 0.06381301879882813, + "learning_rate": 2.978723404255319e-06, + "loss": -0.005, + "reward": 0.7049825556576252, + "reward_std": 0.2429952388862148, + "rewards/accuracy_reward": 0.04375, + "rewards/cosine_scaled_reward": -0.23876744713634251, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9000000089406968, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completion_length": 188.5, + "epoch": 0.08, + "grad_norm": 3.1279549598693848, + "kl": 0.09346847534179688, + "learning_rate": 3.191489361702128e-06, + "loss": -0.0124, + "reward": 0.6519632238894701, + "reward_std": 0.20673316456377505, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.26470344662666323, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8979166761040688, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completion_length": 186.46875, + "epoch": 0.08533333333333333, + "grad_norm": 2.657968759536743, + "kl": 0.10477294921875, + "learning_rate": 3.457446808510639e-06, + "loss": 0.0064, + "reward": 0.685008542239666, + "reward_std": 0.20272360693197697, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.2337414619512856, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.906250013411045, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completion_length": 181.475, + "epoch": 0.09066666666666667, + "grad_norm": 3.079928159713745, + "kl": 0.13871002197265625, + "learning_rate": 3.723404255319149e-06, + "loss": 0.0185, + "reward": 0.743844810128212, + "reward_std": 0.08398498701862991, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.22073852475732564, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9645833358168602, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completion_length": 184.05, + "epoch": 0.096, + "grad_norm": 3.4379212856292725, + "kl": 0.17229766845703126, + "learning_rate": 3.98936170212766e-06, + "loss": 0.0396, + "reward": 0.799515800178051, + "reward_std": 0.12321573820663616, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.1963175404816866, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9645833358168602, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completion_length": 179.325, + "epoch": 0.10133333333333333, + "grad_norm": 3.069020986557007, + "kl": 0.20455322265625, + "learning_rate": 4.255319148936171e-06, + "loss": 0.0521, + "reward": 0.8741896212100982, + "reward_std": 0.19565205958206205, + "rewards/accuracy_reward": 0.04375, + "rewards/cosine_scaled_reward": -0.14247704413719475, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9729166701436043, + "step": 95 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 5.592884063720703, + "learning_rate": 4.414893617021277e-06, + "loss": 0.0194, + "step": 100 + }, + { + "epoch": 0.10666666666666667, + "eval_clip_ratio": 0.0, + "eval_completion_length": 176.2587, + "eval_kl": 0.447543896484375, + "eval_loss": 0.07140910625457764, + "eval_reward": 0.7571212956573814, + "eval_reward_std": 0.33554665619740265, + "eval_rewards/accuracy_reward": 0.035, + "eval_rewards/cosine_scaled_reward": -0.16731203964566813, + "eval_rewards/format_reward": 0.0034, + "eval_rewards/reasoning_steps_reward": 0.8860333378657699, + "eval_runtime": 45451.5577, + "eval_samples_per_second": 0.11, + "eval_steps_per_second": 0.028, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.865625, + "epoch": 0.112, + "grad_norm": 6.593380451202393, + "kl": 0.4110877990722656, + "learning_rate": 4.680851063829788e-06, + "loss": 0.0897, + "reward": 0.7458049319684505, + "reward_std": 0.2787426192197017, + "rewards/accuracy_reward": 0.015625, + "rewards/cosine_scaled_reward": -0.15940340738743544, + "rewards/format_reward": 0.00625, + "rewards/reasoning_steps_reward": 0.8833333402872086, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completion_length": 164.0125, + "epoch": 0.11733333333333333, + "grad_norm": 8.10860538482666, + "kl": 0.537298583984375, + "learning_rate": 4.946808510638298e-06, + "loss": 0.0605, + "reward": 0.7644045952707529, + "reward_std": 0.3260716760531068, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": -0.1293454023078084, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8562500037252903, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completion_length": 139.275, + "epoch": 0.12266666666666666, + "grad_norm": 4.738821029663086, + "kl": 0.5784271240234375, + "learning_rate": 4.9998437598688195e-06, + "loss": 0.0979, + "reward": 0.9231873728334904, + "reward_std": 0.30108860426116735, + "rewards/accuracy_reward": 0.06875, + "rewards/cosine_scaled_reward": -0.07472930029034615, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9291666746139526, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completion_length": 120.325, + "epoch": 0.128, + "grad_norm": 4.913420677185059, + "kl": 0.8223297119140625, + "learning_rate": 4.998889029787758e-06, + "loss": 0.0936, + "reward": 0.9338858745992183, + "reward_std": 0.403242010390386, + "rewards/accuracy_reward": 0.0875, + "rewards/cosine_scaled_reward": -0.022364137368276714, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8687500029802322, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completion_length": 154.9625, + "epoch": 0.13333333333333333, + "grad_norm": 13.690719604492188, + "kl": 2.79224853515625, + "learning_rate": 4.997500548457231e-06, + "loss": 0.208, + "reward": 0.8483377784490586, + "reward_std": 0.41392044560052454, + "rewards/accuracy_reward": 0.08125, + "rewards/cosine_scaled_reward": -0.11207889374345541, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8791666701436043, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.90625, + "epoch": 0.13866666666666666, + "grad_norm": 5.515982151031494, + "kl": 6.988699340820313, + "learning_rate": 4.9955571065548795e-06, + "loss": 0.3533, + "reward": 0.5842950815334916, + "reward_std": 0.38892504015238955, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.21153825148940086, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.7770833354443312, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completion_length": 150.0, + "epoch": 0.144, + "grad_norm": 5.65593147277832, + "kl": 0.9134048461914063, + "learning_rate": 4.992348060495989e-06, + "loss": 0.0899, + "reward": 0.979816447198391, + "reward_std": 0.3385909158969298, + "rewards/accuracy_reward": 0.10625, + "rewards/cosine_scaled_reward": -0.06810021735727786, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9416666716337204, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completion_length": 168.94375, + "epoch": 0.14933333333333335, + "grad_norm": 12.159282684326172, + "kl": 1.6743255615234376, + "learning_rate": 4.9882736864879e-06, + "loss": 0.1391, + "reward": 0.8389136493206024, + "reward_std": 0.3920943819917738, + "rewards/accuracy_reward": 0.08125, + "rewards/cosine_scaled_reward": -0.1631696756929159, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9208333402872085, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completion_length": 169.19375, + "epoch": 0.15466666666666667, + "grad_norm": 14.845711708068848, + "kl": 1.3467193603515626, + "learning_rate": 4.983335399128258e-06, + "loss": 0.1243, + "reward": 0.8044680153485387, + "reward_std": 0.2859855240676552, + "rewards/accuracy_reward": 0.05, + "rewards/cosine_scaled_reward": -0.1351153214694932, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8895833373069764, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completion_length": 160.75625, + "epoch": 0.16, + "grad_norm": 18.14019012451172, + "kl": 1.78900146484375, + "learning_rate": 4.977534912960124e-06, + "loss": 0.1092, + "reward": 0.8079865228617564, + "reward_std": 0.31241205376572906, + "rewards/accuracy_reward": 0.04375, + "rewards/cosine_scaled_reward": -0.11076348531059921, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8750000029802323, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completion_length": 144.0125, + "epoch": 0.16533333333333333, + "grad_norm": 20.268800735473633, + "kl": 3.7872955322265627, + "learning_rate": 4.970874241876697e-06, + "loss": 0.2331, + "reward": 0.6273253193552364, + "reward_std": 0.35022516273102156, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.13309134985704532, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.7416666714474559, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completion_length": 143.4125, + "epoch": 0.17066666666666666, + "grad_norm": 11.427456855773926, + "kl": 3.261151123046875, + "learning_rate": 4.963355698422092e-06, + "loss": 0.193, + "reward": 0.6995385489520232, + "reward_std": 0.3108037303014498, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.1004614585451236, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.7750000052154065, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.35, + "epoch": 0.176, + "grad_norm": 5.5330586433410645, + "kl": 2.344598388671875, + "learning_rate": 4.954981892988451e-06, + "loss": 0.2493, + "reward": 0.8669513031840325, + "reward_std": 0.20902994847856463, + "rewards/accuracy_reward": 0.03125, + "rewards/cosine_scaled_reward": -0.103882029466331, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9395833387970924, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completion_length": 145.88125, + "epoch": 0.18133333333333335, + "grad_norm": 17.83176040649414, + "kl": 2.1351898193359373, + "learning_rate": 4.945755732909625e-06, + "loss": 0.2572, + "reward": 0.8633685514330864, + "reward_std": 0.1485096547054127, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.10954811349511147, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.960416667163372, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completion_length": 123.525, + "epoch": 0.18666666666666668, + "grad_norm": 10.693355560302734, + "kl": 3.7545166015625, + "learning_rate": 4.935680421451764e-06, + "loss": 0.2588, + "reward": 0.8673717919737101, + "reward_std": 0.2996328216511756, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": -0.0555448760278523, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8854166723787784, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.0375, + "epoch": 0.192, + "grad_norm": 8.319129943847656, + "kl": 2.70517578125, + "learning_rate": 4.924759456701167e-06, + "loss": 0.2177, + "reward": 0.8939216539263726, + "reward_std": 0.30960728515638036, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": -0.020661678398028016, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8770833417773247, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completion_length": 93.4625, + "epoch": 0.19733333333333333, + "grad_norm": 9.506866455078125, + "kl": 3.8556793212890623, + "learning_rate": 4.912996630349765e-06, + "loss": 0.2313, + "reward": 0.8351011492311955, + "reward_std": 0.26371640426805243, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.04614884976763278, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8625000029802322, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completion_length": 99.05, + "epoch": 0.20266666666666666, + "grad_norm": 10.067543983459473, + "kl": 2.504437255859375, + "learning_rate": 4.900396026378671e-06, + "loss": 0.2384, + "reward": 0.9122196048498153, + "reward_std": 0.17188128359848634, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.046113729919306935, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9458333417773247, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completion_length": 115.89375, + "epoch": 0.208, + "grad_norm": 14.831565856933594, + "kl": 6.83585205078125, + "learning_rate": 4.886962019640244e-06, + "loss": 0.5534, + "reward": 0.8909050643444061, + "reward_std": 0.13241582510527222, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.08409493574872613, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9687500029802323, + "step": 195 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 13.112170219421387, + "learning_rate": 4.872699274339169e-06, + "loss": 0.4056, + "step": 200 + }, + { + "epoch": 0.21333333333333335, + "eval_clip_ratio": 0.0, + "eval_completion_length": 104.26975, + "eval_kl": NaN, + "eval_loss": NaN, + "eval_reward": 0.8497997302287249, + "eval_reward_std": 0.22811159837578293, + "eval_rewards/accuracy_reward": 0.01425, + "eval_rewards/cosine_scaled_reward": -0.08245027167908939, + "eval_rewards/format_reward": 0.0, + "eval_rewards/reasoning_steps_reward": 0.9180000054657459, + "eval_runtime": 36604.6644, + "eval_samples_per_second": 0.137, + "eval_steps_per_second": 0.034, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completion_length": 101.375, + "epoch": 0.21866666666666668, + "grad_norm": 13.828842163085938, + "kl": 4.618588256835937, + "learning_rate": 4.857612742413072e-06, + "loss": 0.45, + "reward": 0.8754412285983563, + "reward_std": 0.22013528265815693, + "rewards/accuracy_reward": 0.015625, + "rewards/cosine_scaled_reward": -0.0776837759069167, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9375000052154064, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completion_length": 119.96875, + "epoch": 0.224, + "grad_norm": 9.70358657836914, + "kl": 6.018792724609375, + "learning_rate": 4.8417076618132434e-06, + "loss": 0.4681, + "reward": 0.7821227680891752, + "reward_std": 0.23853054337669163, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.12204390410333872, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8979166731238365, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completion_length": 124.28125, + "epoch": 0.22933333333333333, + "grad_norm": 25.299379348754883, + "kl": 4.7243896484375, + "learning_rate": 4.824989554686043e-06, + "loss": 0.3798, + "reward": 0.8066352348774671, + "reward_std": 0.267948625725694, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.10794810801744462, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8958333440124988, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completion_length": 121.075, + "epoch": 0.23466666666666666, + "grad_norm": 7.767666816711426, + "kl": 3.927545166015625, + "learning_rate": 4.807464225455655e-06, + "loss": 0.3358, + "reward": 0.8636085368692875, + "reward_std": 0.26568231563433076, + "rewards/accuracy_reward": 0.0375, + "rewards/cosine_scaled_reward": -0.07389147193171083, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8999999985098839, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completion_length": 132.05625, + "epoch": 0.24, + "grad_norm": 11.310062408447266, + "kl": 5.49139404296875, + "learning_rate": 4.789137758808823e-06, + "loss": 0.432, + "reward": 0.7667079947888851, + "reward_std": 0.25788455196889115, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.1312086760997772, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8916666679084301, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completion_length": 100.15, + "epoch": 0.24533333333333332, + "grad_norm": 7.714837074279785, + "kl": 2.090936279296875, + "learning_rate": 4.770016517582283e-06, + "loss": 0.2939, + "reward": 0.9156014438718557, + "reward_std": 0.33299890445778146, + "rewards/accuracy_reward": 0.05, + "rewards/cosine_scaled_reward": -0.03231522748246789, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8979166761040688, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completion_length": 116.16875, + "epoch": 0.25066666666666665, + "grad_norm": 29.340322494506836, + "kl": 8.26297607421875, + "learning_rate": 4.750107140553627e-06, + "loss": 0.5621, + "reward": 0.7611351676285267, + "reward_std": 0.3555757596914191, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.11594817549921572, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8583333417773247, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completion_length": 110.64375, + "epoch": 0.256, + "grad_norm": 14.472433090209961, + "kl": 6.403857421875, + "learning_rate": 4.7294165401363616e-06, + "loss": 0.5554, + "reward": 0.7904055327177048, + "reward_std": 0.28852546858834105, + "rewards/accuracy_reward": 0.0125, + "rewards/cosine_scaled_reward": -0.1158444695873186, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8937500074505806, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completion_length": 82.34375, + "epoch": 0.2613333333333333, + "grad_norm": 13.236370086669922, + "kl": 6.80526123046875, + "learning_rate": 4.712306397324877e-06, + "loss": 0.5688, + "reward": 0.8986638426780701, + "reward_std": 0.2791194328689016, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.049252831703051926, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9229166701436042, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completion_length": 96.94375, + "epoch": 0.26666666666666666, + "grad_norm": 17.91639518737793, + "kl": 11.293544006347656, + "learning_rate": 4.69022787828549e-06, + "loss": 0.8827, + "reward": 0.7485023282468319, + "reward_std": 0.35306237193290146, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.10983100975863636, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8583333387970924, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.75625, + "epoch": 0.272, + "grad_norm": 16.45020866394043, + "kl": 5.82073974609375, + "learning_rate": 4.66738892556983e-06, + "loss": 0.5283, + "reward": 0.8968394428491593, + "reward_std": 0.24545104982098565, + "rewards/accuracy_reward": 0.01875, + "rewards/cosine_scaled_reward": -0.046910554519854486, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9250000029802322, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completion_length": 77.80625, + "epoch": 0.2773333333333333, + "grad_norm": 32.926937103271484, + "kl": 9.4225341796875, + "learning_rate": 4.643797468722099e-06, + "loss": 0.8073, + "reward": 0.8601903270930051, + "reward_std": 0.18378369295678568, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0877263396163471, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9479166686534881, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completion_length": 81.775, + "epoch": 0.2826666666666667, + "grad_norm": 12.122540473937988, + "kl": 9.3671630859375, + "learning_rate": 4.6194616985513144e-06, + "loss": 0.7643, + "reward": 0.8126994274556637, + "reward_std": 0.26541943780030125, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.09563390930416063, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9020833432674408, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.30625, + "epoch": 0.288, + "grad_norm": 14.13431453704834, + "kl": 7.41337890625, + "learning_rate": 4.594390064287515e-06, + "loss": 0.6638, + "reward": 0.8327964015305043, + "reward_std": 0.2086354006532929, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06512026621494442, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8979166716337204, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.6625, + "epoch": 0.29333333333333333, + "grad_norm": 19.133230209350586, + "kl": 8.81510009765625, + "learning_rate": 4.568591270648233e-06, + "loss": 0.7184, + "reward": 0.8030373096466065, + "reward_std": 0.26814147859986404, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06571269998094068, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8687500089406968, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completion_length": 65.8875, + "epoch": 0.2986666666666667, + "grad_norm": 20.222061157226562, + "kl": 9.532666015625, + "learning_rate": 4.5420742748162735e-06, + "loss": 0.786, + "reward": 0.8763291284441947, + "reward_std": 0.17602166483411566, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06533753902185709, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9416666701436043, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completion_length": 89.26875, + "epoch": 0.304, + "grad_norm": 11.2631254196167, + "kl": 11.497705078125, + "learning_rate": 4.514848283329835e-06, + "loss": 0.9101, + "reward": 0.8311819508671761, + "reward_std": 0.2350232223427156, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.10215137323830277, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9333333358168602, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completion_length": 78.51875, + "epoch": 0.30933333333333335, + "grad_norm": 22.054162979125977, + "kl": 9.7431640625, + "learning_rate": 4.486922748886054e-06, + "loss": 0.8153, + "reward": 0.8432315267622471, + "reward_std": 0.226219168829266, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07551847096183337, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9187500044703484, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completion_length": 62.14375, + "epoch": 0.31466666666666665, + "grad_norm": 21.374130249023438, + "kl": 10.1951416015625, + "learning_rate": 4.458307367059092e-06, + "loss": 0.9005, + "reward": 0.8722259551286697, + "reward_std": 0.19159141974596422, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05902404521766584, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9312499985098839, + "step": 295 + }, + { + "epoch": 0.32, + "grad_norm": 24.809528350830078, + "learning_rate": 4.4290120729338835e-06, + "loss": 1.1358, + "step": 300 + }, + { + "epoch": 0.32, + "eval_clip_ratio": 0.0, + "eval_completion_length": 65.7632, + "eval_kl": Infinity, + "eval_loss": 1.0562268495559692, + "eval_reward": 0.7877680493371532, + "eval_reward_std": 0.28633810927099196, + "eval_rewards/accuracy_reward": 0.0008, + "eval_rewards/cosine_scaled_reward": -0.08163195236857573, + "eval_rewards/format_reward": 0.0, + "eval_rewards/reasoning_steps_reward": 0.8686000056952238, + "eval_runtime": 31706.4113, + "eval_samples_per_second": 0.158, + "eval_steps_per_second": 0.039, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completion_length": 66.99375, + "epoch": 0.3253333333333333, + "grad_norm": 16.53313636779785, + "kl": 13.25738525390625, + "learning_rate": 4.399047037656741e-06, + "loss": 1.0307, + "reward": 0.7880359996110201, + "reward_std": 0.296626356554043, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08279733196541202, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8708333365619183, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completion_length": 47.05625, + "epoch": 0.33066666666666666, + "grad_norm": 21.8547306060791, + "kl": 9.9548583984375, + "learning_rate": 4.368422664903997e-06, + "loss": 0.8694, + "reward": 0.882577420771122, + "reward_std": 0.18155750810219615, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05075591259810608, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9333333373069763, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.09375, + "epoch": 0.336, + "grad_norm": 32.71742248535156, + "kl": 13.6111328125, + "learning_rate": 4.3371495872699044e-06, + "loss": 1.0447, + "reward": 0.8353129029273987, + "reward_std": 0.24553734397513835, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06052043429663172, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8958333328366279, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.075, + "epoch": 0.3413333333333333, + "grad_norm": 17.508634567260742, + "kl": 9.9728515625, + "learning_rate": 4.305238662575073e-06, + "loss": 0.8065, + "reward": 0.8548699423670769, + "reward_std": 0.19330476764516788, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.047213390382239595, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.902083334326744, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.6125, + "epoch": 0.3466666666666667, + "grad_norm": 15.830941200256348, + "kl": 12.16015625, + "learning_rate": 4.272700970096696e-06, + "loss": 1.0147, + "reward": 0.7913525246083737, + "reward_std": 0.26694968109495676, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.058647475835459775, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8500000044703484, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.8875, + "epoch": 0.352, + "grad_norm": 14.494447708129883, + "kl": 14.19609375, + "learning_rate": 4.239547806721892e-06, + "loss": 1.1633, + "reward": 0.76911461353302, + "reward_std": 0.318796195685718, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0558853830647422, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8250000014901161, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.3625, + "epoch": 0.35733333333333334, + "grad_norm": 21.269908905029297, + "kl": 11.915087890625, + "learning_rate": 4.2057906830255006e-06, + "loss": 0.9898, + "reward": 0.8679051876068116, + "reward_std": 0.2277104783368486, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03626146233000327, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9041666701436043, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.35625, + "epoch": 0.3626666666666667, + "grad_norm": 1.002107858657837, + "kl": 8.1767578125, + "learning_rate": 4.1714413192736756e-06, + "loss": 0.52, + "reward": 0.9522343382239342, + "reward_std": 0.09452941585068401, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.012348967575235292, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9645833373069763, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completion_length": 27.9375, + "epoch": 0.368, + "grad_norm": 16.14938735961914, + "kl": 11.773193359375, + "learning_rate": 4.1365116413546835e-06, + "loss": 0.8212, + "reward": 0.899377702921629, + "reward_std": 0.16825042173995824, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03187227531598182, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9312500044703483, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completion_length": 52.64375, + "epoch": 0.37333333333333335, + "grad_norm": 12.938347816467285, + "kl": 16.27109375, + "learning_rate": 4.101013776638309e-06, + "loss": 1.218, + "reward": 0.8322627246379852, + "reward_std": 0.24702412509959687, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06565392787888413, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8979166731238365, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completion_length": 48.01875, + "epoch": 0.37866666666666665, + "grad_norm": 23.911727905273438, + "kl": 10.50517578125, + "learning_rate": 4.064960049765304e-06, + "loss": 1.0027, + "reward": 0.8219170615077018, + "reward_std": 0.28035222916250857, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04891626287571853, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8708333402872086, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.28125, + "epoch": 0.384, + "grad_norm": 7.875117301940918, + "kl": 11.235986328125, + "learning_rate": 4.028362978368352e-06, + "loss": 0.9222, + "reward": 0.8919128350913524, + "reward_std": 0.20153675045185082, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03517048052453901, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9270833373069763, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.05625, + "epoch": 0.3893333333333333, + "grad_norm": 41.62312316894531, + "kl": 15.83330078125, + "learning_rate": 3.991235268726016e-06, + "loss": 1.048, + "reward": 0.8823194235563279, + "reward_std": 0.17005571061081354, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04684721886005718, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.929166667163372, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.98125, + "epoch": 0.39466666666666667, + "grad_norm": 6.0660810470581055, + "kl": 16.304296875, + "learning_rate": 3.9535898113512046e-06, + "loss": 1.1516, + "reward": 0.8508514143526554, + "reward_std": 0.23275069000383156, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06789856371178757, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.91875, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completion_length": 50.45625, + "epoch": 0.4, + "grad_norm": 21.067644119262695, + "kl": 12.8263671875, + "learning_rate": 3.91543967651566e-06, + "loss": 1.0909, + "reward": 0.824290581792593, + "reward_std": 0.2590523644972563, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07987606586975744, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.904166667163372, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completion_length": 56.78125, + "epoch": 0.4053333333333333, + "grad_norm": 36.3121452331543, + "kl": 13.694140625, + "learning_rate": 3.876798109712041e-06, + "loss": 1.0768, + "reward": 0.8105051450431346, + "reward_std": 0.2675132915383074, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08532816520455526, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8958333358168602, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.925, + "epoch": 0.4106666666666667, + "grad_norm": 13.308245658874512, + "kl": 13.184912109375, + "learning_rate": 3.837678527055168e-06, + "loss": 1.184, + "reward": 0.8861061662435532, + "reward_std": 0.2109829214246929, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0493104824112379, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9354166686534882, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.4875, + "epoch": 0.416, + "grad_norm": 10.46920108795166, + "kl": 12.41220703125, + "learning_rate": 3.798094510624037e-06, + "loss": 0.9701, + "reward": 0.8634556472301483, + "reward_std": 0.2479419182986021, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.049044331473123745, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9125000029802323, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completion_length": 26.58125, + "epoch": 0.42133333333333334, + "grad_norm": 11.004674911499023, + "kl": 12.82890625, + "learning_rate": 3.7580598037461933e-06, + "loss": 0.9156, + "reward": 0.9137032449245452, + "reward_std": 0.17160452669631923, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.032130066383979285, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9458333358168602, + "step": 395 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 24.072229385375977, + "learning_rate": 3.7257168766562506e-06, + "loss": 0.7869, + "step": 400 + }, + { + "epoch": 0.4266666666666667, + "eval_clip_ratio": 0.0, + "eval_completion_length": 43.2119, + "eval_kl": NaN, + "eval_loss": NaN, + "eval_reward": 0.8482989487310548, + "eval_reward_std": 0.2360066335176363, + "eval_rewards/accuracy_reward": 0.00035, + "eval_rewards/cosine_scaled_reward": -0.05496769812278835, + "eval_rewards/format_reward": 0.0, + "eval_rewards/reasoning_steps_reward": 0.9029166707515717, + "eval_runtime": 25562.7568, + "eval_samples_per_second": 0.196, + "eval_steps_per_second": 0.049, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.39375, + "epoch": 0.432, + "grad_norm": 7.45121431350708, + "kl": 15.7309326171875, + "learning_rate": 3.6849060565546753e-06, + "loss": 1.3405, + "reward": 0.8567041307687759, + "reward_std": 0.20867629193626272, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.054754182432952804, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9114583350718022, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completion_length": 55.01875, + "epoch": 0.43733333333333335, + "grad_norm": 21.51198387145996, + "kl": 12.4251953125, + "learning_rate": 3.6436838443429177e-06, + "loss": 1.0869, + "reward": 0.7966017562896013, + "reward_std": 0.28306140007152863, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06381489653722383, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8604166701436042, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.6, + "epoch": 0.44266666666666665, + "grad_norm": 22.4779109954834, + "kl": 10.958740234375, + "learning_rate": 3.6020645521200474e-06, + "loss": 1.0709, + "reward": 0.9217145010828972, + "reward_std": 0.1811069515156305, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.04286881822627038, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9583333358168602, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completion_length": 54.6875, + "epoch": 0.448, + "grad_norm": 17.699045181274414, + "kl": 19.492236328125, + "learning_rate": 3.560062629848876e-06, + "loss": 1.4715, + "reward": 0.8309016443789006, + "reward_std": 0.27125840056105516, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.07951500885537825, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9104166716337204, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.73125, + "epoch": 0.4533333333333333, + "grad_norm": 13.910821914672852, + "kl": 13.74169921875, + "learning_rate": 3.5176926603390176e-06, + "loss": 1.0821, + "reward": 0.9037241205573082, + "reward_std": 0.1520853552130575, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.050442523750825786, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9541666731238365, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completion_length": 46.25625, + "epoch": 0.45866666666666667, + "grad_norm": 35.96714782714844, + "kl": 15.023095703125, + "learning_rate": 3.4749693541838305e-06, + "loss": 1.2674, + "reward": 0.8611304022371769, + "reward_std": 0.2457447752461121, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05970291049015941, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9208333373069764, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.18125, + "epoch": 0.464, + "grad_norm": 23.17320442199707, + "kl": 14.875146484375, + "learning_rate": 3.4405462708416393e-06, + "loss": 0.9348, + "reward": 0.9196932911872864, + "reward_std": 0.15923782959812344, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.028223346812592354, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9479166686534881, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.56875, + "epoch": 0.4693333333333333, + "grad_norm": 27.985633850097656, + "kl": 13.66435546875, + "learning_rate": 3.3972244177161966e-06, + "loss": 1.0366, + "reward": 0.8581299114972353, + "reward_std": 0.2213375417979478, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.054370071421726604, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9125000044703484, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.3125, + "epoch": 0.4746666666666667, + "grad_norm": 19.979711532592773, + "kl": 12.21240234375, + "learning_rate": 3.353591053779859e-06, + "loss": 0.9727, + "reward": 0.898980014026165, + "reward_std": 0.1949043121188879, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03851995818695286, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9375000014901161, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.8375, + "epoch": 0.48, + "grad_norm": 9.39575481414795, + "kl": 12.267724609375, + "learning_rate": 3.309661328268776e-06, + "loss": 0.7964, + "reward": 0.8783880487084389, + "reward_std": 0.1970566307652007, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04244526417023735, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9208333373069764, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completion_length": 20.51875, + "epoch": 0.48533333333333334, + "grad_norm": 7.1221466064453125, + "kl": 9.299951171875, + "learning_rate": 3.2654504933140165e-06, + "loss": 0.6537, + "reward": 0.9451988354325295, + "reward_std": 0.09344644367556612, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.013134470967634116, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9583333358168602, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.9, + "epoch": 0.49066666666666664, + "grad_norm": 33.304874420166016, + "kl": 12.762646484375, + "learning_rate": 3.2209738986461186e-06, + "loss": 0.9676, + "reward": 0.8979712955653667, + "reward_std": 0.17766471231188916, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.04161201652896125, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9333333358168602, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.25, + "epoch": 0.496, + "grad_norm": 11.247632026672363, + "kl": 12.395263671875, + "learning_rate": 3.1762469862657673e-06, + "loss": 0.9313, + "reward": 0.8783193781971932, + "reward_std": 0.21862645422424976, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.044597260178125, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9229166701436042, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.64375, + "epoch": 0.5013333333333333, + "grad_norm": 20.89920997619629, + "kl": 11.95849609375, + "learning_rate": 3.1312852850824183e-06, + "loss": 0.8608, + "reward": 0.9111865252256394, + "reward_std": 0.14750807457885456, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03673011756764026, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9479166671633721, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.10625, + "epoch": 0.5066666666666667, + "grad_norm": 163.60240173339844, + "kl": 13.748583984375, + "learning_rate": 3.086104405522758e-06, + "loss": 0.9573, + "reward": 0.8916466869413853, + "reward_std": 0.16286184734963172, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05001995721540879, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9416666686534881, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completion_length": 75.15625, + "epoch": 0.512, + "grad_norm": 11.370027542114258, + "kl": 17.1302734375, + "learning_rate": 3.0407200341108618e-06, + "loss": 1.3223, + "reward": 0.8070020548999309, + "reward_std": 0.2748953197384253, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.10966460229246877, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9166666701436043, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completion_length": 63.71875, + "epoch": 0.5173333333333333, + "grad_norm": 11.755677223205566, + "kl": 13.574951171875, + "learning_rate": 2.995147928021925e-06, + "loss": 1.2125, + "reward": 0.8644190408289433, + "reward_std": 0.19235301127191634, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.08766427135560662, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9520833358168602, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.76875, + "epoch": 0.5226666666666666, + "grad_norm": 9.177205085754395, + "kl": 10.011669921875, + "learning_rate": 2.9494039096114724e-06, + "loss": 0.8227, + "reward": 0.894260024279356, + "reward_std": 0.16061883713009592, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.045323285380436576, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9395833358168602, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.7625, + "epoch": 0.528, + "grad_norm": 22.728939056396484, + "kl": 13.146923828125, + "learning_rate": 2.903503860921931e-06, + "loss": 1.0093, + "reward": 0.8698573663830758, + "reward_std": 0.20619451993443363, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04680927444132976, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9166666716337204, + "step": 495 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 13.398255348205566, + "learning_rate": 2.8574637181684817e-06, + "loss": 1.1674, + "step": 500 + }, + { + "epoch": 0.5333333333333333, + "eval_clip_ratio": 0.0, + "eval_completion_length": 28.33905, + "eval_kl": 12.18606328125, + "eval_loss": 0.8569625020027161, + "eval_reward": 0.90754769334288, + "eval_reward_std": 0.1603725745135857, + "eval_rewards/accuracy_reward": 0.0002, + "eval_rewards/cosine_scaled_reward": -0.03205228116786893, + "eval_rewards/format_reward": 0.0, + "eval_rewards/reasoning_steps_reward": 0.9394000029951334, + "eval_runtime": 17575.1409, + "eval_samples_per_second": 0.284, + "eval_steps_per_second": 0.071, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.834375, + "epoch": 0.5386666666666666, + "grad_norm": 5.583011150360107, + "kl": 12.7831787109375, + "learning_rate": 2.8112994662061065e-06, + "loss": 0.7071, + "reward": 0.8887035015970468, + "reward_std": 0.17844919376798316, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.041504804241412785, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9302083358168602, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.7, + "epoch": 0.544, + "grad_norm": 21.54783821105957, + "kl": 10.3671875, + "learning_rate": 2.765027132979743e-06, + "loss": 0.7544, + "reward": 0.9107995986938476, + "reward_std": 0.16975313210086823, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.030867045068589505, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.935416667163372, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.76875, + "epoch": 0.5493333333333333, + "grad_norm": 16.331401824951172, + "kl": 9.68544921875, + "learning_rate": 2.718662783959478e-06, + "loss": 0.7631, + "reward": 0.9468030020594597, + "reward_std": 0.0817515407301471, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02611364198673982, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.972916667163372, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.125, + "epoch": 0.5546666666666666, + "grad_norm": 15.422439575195312, + "kl": 13.121435546875, + "learning_rate": 2.672222516562719e-06, + "loss": 1.0083, + "reward": 0.9191447854042053, + "reward_std": 0.15190400344636146, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03502185242396081, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9541666716337204, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completion_length": 45.1625, + "epoch": 0.56, + "grad_norm": 9.857662200927734, + "kl": 17.79384765625, + "learning_rate": 2.6257224545652688e-06, + "loss": 1.3274, + "reward": 0.8831685408949852, + "reward_std": 0.1921324184851983, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05433144455164438, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9375000044703483, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.85, + "epoch": 0.5653333333333334, + "grad_norm": 16.353286743164062, + "kl": 13.777734375, + "learning_rate": 2.579178742503245e-06, + "loss": 0.9957, + "reward": 0.9024578690528869, + "reward_std": 0.1723116828528873, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.041292104346212, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9437500029802323, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.5625, + "epoch": 0.5706666666666667, + "grad_norm": 8.267068862915039, + "kl": 10.66357421875, + "learning_rate": 2.5326075400678037e-06, + "loss": 0.8638, + "reward": 0.9338416069746017, + "reward_std": 0.11608097783646372, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.026575039059389384, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9604166716337204, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.09375, + "epoch": 0.576, + "grad_norm": 12.732483863830566, + "kl": 14.028271484375, + "learning_rate": 2.4860250164945877e-06, + "loss": 1.1509, + "reward": 0.8834603920578956, + "reward_std": 0.20504345865338108, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04778958541719476, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9312500074505806, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.90625, + "epoch": 0.5813333333333334, + "grad_norm": 14.660567283630371, + "kl": 15.737451171875, + "learning_rate": 2.4394473449498705e-06, + "loss": 1.177, + "reward": 0.8580375552177429, + "reward_std": 0.22333436018479916, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.050295749311044344, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9083333358168602, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6625, + "epoch": 0.5866666666666667, + "grad_norm": 8.702583312988281, + "kl": 12.357177734375, + "learning_rate": 2.392890696915329e-06, + "loss": 0.9558, + "reward": 0.8997138164937496, + "reward_std": 0.17692614756524563, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04195282297878293, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9416666716337204, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completion_length": 49.15625, + "epoch": 0.592, + "grad_norm": 12.479802131652832, + "kl": 16.8791015625, + "learning_rate": 2.346371236573409e-06, + "loss": 1.2617, + "reward": 0.8275570668280124, + "reward_std": 0.2324092355556786, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.059942916077852716, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8875000044703484, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.6625, + "epoch": 0.5973333333333334, + "grad_norm": 7.198239803314209, + "kl": 12.54541015625, + "learning_rate": 2.2999051151952168e-06, + "loss": 0.9839, + "reward": 0.8674640908837319, + "reward_std": 0.2356707454970092, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.042952546622836964, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9104166716337204, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.525, + "epoch": 0.6026666666666667, + "grad_norm": 3.6896111965179443, + "kl": 11.279638671875, + "learning_rate": 2.2535084655328957e-06, + "loss": 0.8347, + "reward": 0.9166582852602005, + "reward_std": 0.148460166777204, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03542502008058364, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9520833328366279, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completion_length": 23.55625, + "epoch": 0.608, + "grad_norm": 10.003271102905273, + "kl": 10.087255859375, + "learning_rate": 2.2071973962184385e-06, + "loss": 0.7287, + "reward": 0.928079903870821, + "reward_std": 0.12746163122355939, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.019836739538004623, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9479166686534881, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.8625, + "epoch": 0.6133333333333333, + "grad_norm": 8.542618751525879, + "kl": 14.265576171875, + "learning_rate": 2.1609879861708664e-06, + "loss": 1.1148, + "reward": 0.8895561441779136, + "reward_std": 0.19691728233592584, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03752716149028856, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9270833387970925, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.3125, + "epoch": 0.6186666666666667, + "grad_norm": 14.088128089904785, + "kl": 15.01044921875, + "learning_rate": 2.1148962790137258e-06, + "loss": 1.1356, + "reward": 0.8983195193111897, + "reward_std": 0.17465938089881092, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.043347125269065145, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9416666686534881, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.19375, + "epoch": 0.624, + "grad_norm": 7.33213472366333, + "kl": 12.38037109375, + "learning_rate": 2.068938277504842e-06, + "loss": 0.89, + "reward": 0.9119960308074951, + "reward_std": 0.1503613638204115, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.033837280941952486, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9458333358168602, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.31875, + "epoch": 0.6293333333333333, + "grad_norm": 5.882805347442627, + "kl": 11.83037109375, + "learning_rate": 2.02312993798026e-06, + "loss": 0.919, + "reward": 0.8715338334441185, + "reward_std": 0.21188033148646354, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0367994706160971, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9083333402872086, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completion_length": 26.5625, + "epoch": 0.6346666666666667, + "grad_norm": 4.920403003692627, + "kl": 10.397021484375, + "learning_rate": 1.9774871648143033e-06, + "loss": 0.8162, + "reward": 0.9242352560162544, + "reward_std": 0.14808180312784316, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02368138517922489, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9479166716337204, + "step": 595 + }, + { + "epoch": 0.64, + "grad_norm": 8.031571388244629, + "learning_rate": 1.93202580489767e-06, + "loss": 0.9767, + "step": 600 + }, + { + "epoch": 0.64, + "eval_clip_ratio": 0.0, + "eval_completion_length": 24.49845, + "eval_kl": 11.000201171875, + "eval_loss": 0.762321412563324, + "eval_reward": 0.9325542439005221, + "eval_reward_std": 0.11938674081818317, + "eval_rewards/accuracy_reward": 0.0003, + "eval_rewards/cosine_scaled_reward": -0.021479063632985344, + "eval_rewards/format_reward": 0.0, + "eval_rewards/reasoning_steps_reward": 0.9537333353444933, + "eval_runtime": 14835.8257, + "eval_samples_per_second": 0.337, + "eval_steps_per_second": 0.084, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completion_length": 25.103125, + "epoch": 0.6453333333333333, + "grad_norm": 2.8671510219573975, + "kl": 11.370947265625, + "learning_rate": 1.886761642135495e-06, + "loss": 0.5784, + "reward": 0.9217620514333248, + "reward_std": 0.1400632432058046, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0230295914618182, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9447916686534882, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completion_length": 27.1625, + "epoch": 0.6506666666666666, + "grad_norm": 10.607571601867676, + "kl": 12.068896484375, + "learning_rate": 1.8417103919672686e-06, + "loss": 0.8698, + "reward": 0.9195777177810669, + "reward_std": 0.1432917347177863, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02625558597937925, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9458333343267441, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completion_length": 26.18125, + "epoch": 0.656, + "grad_norm": 5.264357089996338, + "kl": 10.251806640625, + "learning_rate": 1.7968876959105353e-06, + "loss": 0.7371, + "reward": 0.932566262036562, + "reward_std": 0.1156949118234479, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.025767041655490174, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9583333328366279, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completion_length": 24.19375, + "epoch": 0.6613333333333333, + "grad_norm": 9.326305389404297, + "kl": 11.10224609375, + "learning_rate": 1.7523091161302552e-06, + "loss": 0.82, + "reward": 0.9506021127104759, + "reward_std": 0.08566303365714703, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01606452676060144, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9666666686534882, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.30625, + "epoch": 0.6666666666666666, + "grad_norm": 6.24821138381958, + "kl": 12.0203125, + "learning_rate": 1.707990130035717e-06, + "loss": 0.9376, + "reward": 0.9201748922467232, + "reward_std": 0.14491571187973024, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.029825082910247148, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9500000029802322, + "step": 625 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.89375, + "epoch": 0.672, + "grad_norm": 14.636734962463379, + "kl": 17.04765625, + "learning_rate": 1.6639461249068727e-06, + "loss": 1.2657, + "reward": 0.878020665794611, + "reward_std": 0.22177106849794653, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.049062640547344924, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9270833387970925, + "step": 630 + }, + { + "clip_ratio": 0.0, + "completion_length": 41.09375, + "epoch": 0.6773333333333333, + "grad_norm": 14.611737251281738, + "kl": 15.438720703125, + "learning_rate": 1.6201923925519742e-06, + "loss": 1.1666, + "reward": 0.8792784817516803, + "reward_std": 0.18853381305234507, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04988816333207069, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9291666701436043, + "step": 635 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.025, + "epoch": 0.6826666666666666, + "grad_norm": 8.413250923156738, + "kl": 9.849609375, + "learning_rate": 1.5767441239983433e-06, + "loss": 0.7778, + "reward": 0.895102259516716, + "reward_std": 0.17950981706380845, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.027814385169767773, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9229166671633721, + "step": 640 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.7, + "epoch": 0.688, + "grad_norm": 8.036944389343262, + "kl": 10.983837890625, + "learning_rate": 1.5336164042181495e-06, + "loss": 0.847, + "reward": 0.9059751465916633, + "reward_std": 0.15557526089032764, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03152483354060678, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9375000029802323, + "step": 645 + }, + { + "clip_ratio": 0.0, + "completion_length": 26.625, + "epoch": 0.6933333333333334, + "grad_norm": 11.110798835754395, + "kl": 11.3498046875, + "learning_rate": 1.4908242068909922e-06, + "loss": 0.8364, + "reward": 0.9294438496232033, + "reward_std": 0.12713208887726068, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.024722788939834574, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9541666701436042, + "step": 650 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.7, + "epoch": 0.6986666666666667, + "grad_norm": 6.07194185256958, + "kl": 14.727978515625, + "learning_rate": 1.4483823892051346e-06, + "loss": 1.1029, + "reward": 0.8648552462458611, + "reward_std": 0.22041521333158015, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04347805892175529, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9083333358168602, + "step": 655 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.825, + "epoch": 0.704, + "grad_norm": 19.0600528717041, + "kl": 12.726025390625, + "learning_rate": 1.4063056866991826e-06, + "loss": 0.9563, + "reward": 0.8969103991985321, + "reward_std": 0.18587914234958589, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.032256243082520085, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9291666716337204, + "step": 660 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.3625, + "epoch": 0.7093333333333334, + "grad_norm": 8.13565731048584, + "kl": 11.763818359375, + "learning_rate": 1.3646087081459875e-06, + "loss": 0.8517, + "reward": 0.9051434069871902, + "reward_std": 0.16978331273421646, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.028189902065787465, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.933333334326744, + "step": 665 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.73125, + "epoch": 0.7146666666666667, + "grad_norm": 7.444943428039551, + "kl": 13.7029296875, + "learning_rate": 1.3233059304805798e-06, + "loss": 1.0022, + "reward": 0.8938254207372666, + "reward_std": 0.19537937436252834, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03950788572692545, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.933333334326744, + "step": 670 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.3125, + "epoch": 0.72, + "grad_norm": 8.320772171020508, + "kl": 14.048828125, + "learning_rate": 1.282411693773858e-06, + "loss": 1.1004, + "reward": 0.8735457874834538, + "reward_std": 0.21316041266545654, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04103752294467995, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9145833373069763, + "step": 675 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.80625, + "epoch": 0.7253333333333334, + "grad_norm": 9.357903480529785, + "kl": 10.77841796875, + "learning_rate": 1.2419401962538075e-06, + "loss": 0.8574, + "reward": 0.9765814572572709, + "reward_std": 0.13411221810274582, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.0067518523617764005, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9583333343267441, + "step": 680 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.9875, + "epoch": 0.7306666666666667, + "grad_norm": 7.937640190124512, + "kl": 10.77626953125, + "learning_rate": 1.2019054893759632e-06, + "loss": 0.8406, + "reward": 0.9284723967313766, + "reward_std": 0.12998418211936952, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03611091619386571, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.964583334326744, + "step": 685 + }, + { + "clip_ratio": 0.0, + "completion_length": 29.71875, + "epoch": 0.736, + "grad_norm": 6.991804599761963, + "kl": 11.34892578125, + "learning_rate": 1.1623214729448318e-06, + "loss": 0.8146, + "reward": 0.9095292709767818, + "reward_std": 0.14653535146389912, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.032137370252166876, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9416666701436043, + "step": 690 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.41875, + "epoch": 0.7413333333333333, + "grad_norm": 15.745359420776367, + "kl": 13.367529296875, + "learning_rate": 1.1232018902879603e-06, + "loss": 0.9653, + "reward": 0.909796753525734, + "reward_std": 0.15389935614075512, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0402032266400056, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.95, + "step": 695 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 6.5044684410095215, + "learning_rate": 1.0845603234843406e-06, + "loss": 1.1744, + "step": 700 + }, + { + "epoch": 0.7466666666666667, + "eval_clip_ratio": 0.0, + "eval_completion_length": 39.7183, + "eval_kl": 14.359070703125, + "eval_loss": 1.0987683534622192, + "eval_reward": 0.849645221591182, + "eval_reward_std": 0.2433718167852072, + "eval_rewards/accuracy_reward": 0.00065, + "eval_rewards/cosine_scaled_reward": -0.047671423346560916, + "eval_rewards/format_reward": 0.0, + "eval_rewards/reasoning_steps_reward": 0.8966666696608067, + "eval_runtime": 24227.8268, + "eval_samples_per_second": 0.206, + "eval_steps_per_second": 0.052, + "step": 700 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.634375, + "epoch": 0.752, + "grad_norm": 6.759803295135498, + "kl": 13.7447998046875, + "learning_rate": 1.0464101886487958e-06, + "loss": 0.9101, + "reward": 0.8515742581337691, + "reward_std": 0.2163259312044829, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.049467386461037675, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9010416690260172, + "step": 705 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.8375, + "epoch": 0.7573333333333333, + "grad_norm": 6.217792510986328, + "kl": 13.227294921875, + "learning_rate": 1.008764731273985e-06, + "loss": 1.0981, + "reward": 0.8388586275279521, + "reward_std": 0.24302853061817586, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05072467893041903, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8895833402872085, + "step": 710 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.425, + "epoch": 0.7626666666666667, + "grad_norm": 4.551347255706787, + "kl": 12.595361328125, + "learning_rate": 9.716370216316484e-07, + "loss": 0.9808, + "reward": 0.8465159472078085, + "reward_std": 0.23796997629106045, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.047234023729106414, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8937499970197678, + "step": 715 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.04375, + "epoch": 0.768, + "grad_norm": 5.863572120666504, + "kl": 11.475537109375, + "learning_rate": 9.35039950234696e-07, + "loss": 0.8703, + "reward": 0.9083453208208084, + "reward_std": 0.16151572642847895, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.031237985155894422, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9395833358168602, + "step": 720 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.75625, + "epoch": 0.7733333333333333, + "grad_norm": 9.391020774841309, + "kl": 12.37578125, + "learning_rate": 8.98986223361692e-07, + "loss": 0.9519, + "reward": 0.8641272462904453, + "reward_std": 0.23181609474122525, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.037956064224999864, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8958333358168602, + "step": 725 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.46875, + "epoch": 0.7786666666666666, + "grad_norm": 8.833009719848633, + "kl": 15.742041015625, + "learning_rate": 8.634883586453178e-07, + "loss": 1.1814, + "reward": 0.8054626323282719, + "reward_std": 0.2871685145733863, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.06120401412335923, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8666666701436043, + "step": 730 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.24375, + "epoch": 0.784, + "grad_norm": 7.315384864807129, + "kl": 15.74951171875, + "learning_rate": 8.285586807263255e-07, + "loss": 1.2603, + "reward": 0.8248628986999392, + "reward_std": 0.2860633011907339, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.054303745714423715, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8791666693985463, + "step": 735 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.0375, + "epoch": 0.7893333333333333, + "grad_norm": 6.141713619232178, + "kl": 13.780419921875, + "learning_rate": 7.942093169745005e-07, + "loss": 1.0372, + "reward": 0.8739085428416729, + "reward_std": 0.2165603557601571, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04692476779600838, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9208333387970924, + "step": 740 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.03125, + "epoch": 0.7946666666666666, + "grad_norm": 8.604565620422363, + "kl": 13.617626953125, + "learning_rate": 7.604521932781081e-07, + "loss": 1.0319, + "reward": 0.8153880290687084, + "reward_std": 0.2841905845445581, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.045028608468419405, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8604166716337204, + "step": 745 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.51875, + "epoch": 0.8, + "grad_norm": 6.989229679107666, + "kl": 13.618359375, + "learning_rate": 7.272990299033045e-07, + "loss": 1.0911, + "reward": 0.8738323897123337, + "reward_std": 0.2549275178424068, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.036584255800698885, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9041666686534882, + "step": 750 + }, + { + "clip_ratio": 0.0, + "completion_length": 18.93125, + "epoch": 0.8053333333333333, + "grad_norm": 3.7775022983551025, + "kl": 7.380615234375, + "learning_rate": 6.94761337424927e-07, + "loss": 0.5148, + "reward": 0.9608460694551468, + "reward_std": 0.07732116826809943, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.01207056987186661, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.972916667163372, + "step": 755 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.05, + "epoch": 0.8106666666666666, + "grad_norm": 6.422189712524414, + "kl": 12.621875, + "learning_rate": 6.628504127300961e-07, + "loss": 0.9193, + "reward": 0.9023310661315918, + "reward_std": 0.17520240979865775, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03933557301206747, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9416666731238366, + "step": 760 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.13125, + "epoch": 0.816, + "grad_norm": 8.22676944732666, + "kl": 13.900146484375, + "learning_rate": 6.315773350960036e-07, + "loss": 1.0414, + "reward": 0.8918016396462918, + "reward_std": 0.19220082159044977, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03319833040877711, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9250000014901161, + "step": 765 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.56875, + "epoch": 0.8213333333333334, + "grad_norm": 10.879252433776855, + "kl": 12.575537109375, + "learning_rate": 6.009529623432591e-07, + "loss": 0.8356, + "reward": 0.9034772761166096, + "reward_std": 0.19452963769435883, + "rewards/accuracy_reward": 0.025, + "rewards/cosine_scaled_reward": -0.013189362817502114, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8916666686534882, + "step": 770 + }, + { + "clip_ratio": 0.0, + "completion_length": 28.9125, + "epoch": 0.8266666666666667, + "grad_norm": 4.9291253089904785, + "kl": 11.142626953125, + "learning_rate": 5.70987927066117e-07, + "loss": 0.8191, + "reward": 0.8983419455587864, + "reward_std": 0.16392848258838058, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.026658029100508428, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9250000014901161, + "step": 775 + }, + { + "clip_ratio": 0.0, + "completion_length": 30.6, + "epoch": 0.832, + "grad_norm": 6.1532745361328125, + "kl": 11.01689453125, + "learning_rate": 5.416926329409083e-07, + "loss": 0.7916, + "reward": 0.9111720651388169, + "reward_std": 0.1525883299400448, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03257790798379574, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9437500029802323, + "step": 780 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.1, + "epoch": 0.8373333333333334, + "grad_norm": 10.675848960876465, + "kl": 14.37998046875, + "learning_rate": 5.130772511139456e-07, + "loss": 1.1733, + "reward": 0.8728187620639801, + "reward_std": 0.2474873424973339, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03968120885547251, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9124999985098838, + "step": 785 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.5, + "epoch": 0.8426666666666667, + "grad_norm": 6.78824520111084, + "kl": 13.61171875, + "learning_rate": 4.851517166701658e-07, + "loss": 1.0231, + "reward": 0.868859538435936, + "reward_std": 0.22290587332099676, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04364043357345508, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9125000014901161, + "step": 790 + }, + { + "clip_ratio": 0.0, + "completion_length": 43.45625, + "epoch": 0.848, + "grad_norm": 11.661620140075684, + "kl": 13.742529296875, + "learning_rate": 4.5792572518372714e-07, + "loss": 1.0371, + "reward": 0.8495261050760746, + "reward_std": 0.256103105548209, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.04422387464583153, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8875000014901161, + "step": 795 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 11.056589126586914, + "learning_rate": 4.3140872935176714e-07, + "loss": 0.9557, + "step": 800 + }, + { + "epoch": 0.8533333333333334, + "eval_clip_ratio": 0.0, + "eval_completion_length": 33.62895, + "eval_kl": 12.172851171875, + "eval_loss": 0.9448180794715881, + "eval_reward": 0.884240204228802, + "eval_reward_std": 0.19757600214657473, + "eval_rewards/accuracy_reward": 0.00055, + "eval_rewards/cosine_scaled_reward": -0.03577643865282516, + "eval_rewards/format_reward": 0.0, + "eval_rewards/reasoning_steps_reward": 0.9194666695296765, + "eval_runtime": 22547.6102, + "eval_samples_per_second": 0.222, + "eval_steps_per_second": 0.055, + "step": 800 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.159375, + "epoch": 0.8586666666666667, + "grad_norm": 14.952364921569824, + "kl": 13.27841796875, + "learning_rate": 4.0560993571248485e-07, + "loss": 1.0932, + "reward": 0.8883439194411039, + "reward_std": 0.19072621676800736, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.039781055343337354, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9281250014901161, + "step": 805 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.05, + "epoch": 0.864, + "grad_norm": 10.145599365234375, + "kl": 12.07646484375, + "learning_rate": 3.805383014486855e-07, + "loss": 0.9373, + "reward": 0.8534569308161736, + "reward_std": 0.24032519459724427, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.036126373808656354, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8895833373069764, + "step": 810 + }, + { + "clip_ratio": 0.0, + "completion_length": 23.8625, + "epoch": 0.8693333333333333, + "grad_norm": 5.662459373474121, + "kl": 8.621875, + "learning_rate": 3.5620253127790187e-07, + "loss": 0.5794, + "reward": 0.9346766419708729, + "reward_std": 0.09575240558187943, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.02157333122449927, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9562500014901161, + "step": 815 + }, + { + "clip_ratio": 0.0, + "completion_length": 44.1875, + "epoch": 0.8746666666666667, + "grad_norm": 5.7309980392456055, + "kl": 15.955322265625, + "learning_rate": 3.3261107443017054e-07, + "loss": 1.2488, + "reward": 0.8095948047935962, + "reward_std": 0.3085315997945145, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05915517628745874, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8687500044703483, + "step": 820 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.75625, + "epoch": 0.88, + "grad_norm": 7.436826229095459, + "kl": 12.489697265625, + "learning_rate": 3.0977212171451e-07, + "loss": 0.9417, + "reward": 0.8895582810044289, + "reward_std": 0.1721190543845296, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03752502345741959, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9270833358168602, + "step": 825 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.575, + "epoch": 0.8853333333333333, + "grad_norm": 5.596876621246338, + "kl": 12.599609375, + "learning_rate": 2.876936026751234e-07, + "loss": 0.9465, + "reward": 0.8497903808951378, + "reward_std": 0.23422730285674334, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.039792930785915816, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8895833387970924, + "step": 830 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.8125, + "epoch": 0.8906666666666667, + "grad_norm": 9.07991886138916, + "kl": 15.519482421875, + "learning_rate": 2.663831828383098e-07, + "loss": 1.2503, + "reward": 0.8558540269732475, + "reward_std": 0.24497014822754865, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04414595882117282, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9000000014901162, + "step": 835 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.5625, + "epoch": 0.896, + "grad_norm": 10.441224098205566, + "kl": 12.43076171875, + "learning_rate": 2.4584826105103764e-07, + "loss": 0.9553, + "reward": 0.8752666190266609, + "reward_std": 0.2350561751052737, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.035150024328322614, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9104166686534881, + "step": 840 + }, + { + "clip_ratio": 0.0, + "completion_length": 40.03125, + "epoch": 0.9013333333333333, + "grad_norm": 15.380375862121582, + "kl": 15.1236328125, + "learning_rate": 2.2609596691211406e-07, + "loss": 1.1801, + "reward": 0.8480764515697956, + "reward_std": 0.251245317235589, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04984018635150278, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8979166716337204, + "step": 845 + }, + { + "clip_ratio": 0.0, + "completion_length": 38.30625, + "epoch": 0.9066666666666666, + "grad_norm": 24.35810661315918, + "kl": 14.10478515625, + "learning_rate": 2.071331582968289e-07, + "loss": 1.0496, + "reward": 0.8491456843912601, + "reward_std": 0.23082056756447855, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.044604293110023716, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8937500014901161, + "step": 850 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.91875, + "epoch": 0.912, + "grad_norm": 11.125753402709961, + "kl": 11.45751953125, + "learning_rate": 1.889664189759449e-07, + "loss": 0.8088, + "reward": 0.902073758840561, + "reward_std": 0.15496895949763712, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03542621683154721, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9375, + "step": 855 + }, + { + "clip_ratio": 0.0, + "completion_length": 32.85625, + "epoch": 0.9173333333333333, + "grad_norm": 10.627705574035645, + "kl": 13.045654296875, + "learning_rate": 1.7160205632985067e-07, + "loss": 0.9879, + "reward": 0.8759149216115475, + "reward_std": 0.2140658195130527, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03866838661197107, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9145833358168602, + "step": 860 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.75, + "epoch": 0.9226666666666666, + "grad_norm": 6.658540725708008, + "kl": 13.058203125, + "learning_rate": 1.550460991586794e-07, + "loss": 1.0528, + "reward": 0.8701386474072933, + "reward_std": 0.21246134424582125, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03819466594577534, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.908333332836628, + "step": 865 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.7875, + "epoch": 0.928, + "grad_norm": 9.237292289733887, + "kl": 13.703759765625, + "learning_rate": 1.3930429558914492e-07, + "loss": 1.0694, + "reward": 0.8584538690745831, + "reward_std": 0.22779980981722475, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.0415461105396389, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9000000059604645, + "step": 870 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.2, + "epoch": 0.9333333333333333, + "grad_norm": 9.896727561950684, + "kl": 12.895263671875, + "learning_rate": 1.2438211107882654e-07, + "loss": 1.0122, + "reward": 0.8824942708015442, + "reward_std": 0.2223971493065619, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03833904006460216, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9208333358168602, + "step": 875 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.3625, + "epoch": 0.9386666666666666, + "grad_norm": 13.891570091247559, + "kl": 14.306591796875, + "learning_rate": 1.1028472651859829e-07, + "loss": 1.076, + "reward": 0.8697241485118866, + "reward_std": 0.21391889560036362, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03860916146513773, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9083333343267441, + "step": 880 + }, + { + "clip_ratio": 0.0, + "completion_length": 42.85625, + "epoch": 0.944, + "grad_norm": 8.301593780517578, + "kl": 14.454296875, + "learning_rate": 9.701703643385296e-08, + "loss": 1.0311, + "reward": 0.8556662514805794, + "reward_std": 0.21970865479670465, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.05266706076072296, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9083333313465118, + "step": 885 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.60625, + "epoch": 0.9493333333333334, + "grad_norm": 8.000605583190918, + "kl": 12.25859375, + "learning_rate": 8.45836472851544e-08, + "loss": 0.8527, + "reward": 0.8944506429135799, + "reward_std": 0.20130182611646888, + "rewards/accuracy_reward": 0.00625, + "rewards/cosine_scaled_reward": -0.034716006646340246, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9229166701436042, + "step": 890 + }, + { + "clip_ratio": 0.0, + "completion_length": 36.0375, + "epoch": 0.9546666666666667, + "grad_norm": 8.075848579406738, + "kl": 13.883349609375, + "learning_rate": 7.298887586890207e-08, + "loss": 1.0478, + "reward": 0.8447876520454883, + "reward_std": 0.24192846552468836, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.044795656835776756, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8895833335816861, + "step": 895 + }, + { + "epoch": 0.96, + "grad_norm": 14.867563247680664, + "learning_rate": 6.223674781856593e-08, + "loss": 1.0318, + "step": 900 + }, + { + "epoch": 0.96, + "eval_clip_ratio": 0.0, + "eval_completion_length": 36.44145, + "eval_kl": 13.474596875, + "eval_loss": 1.031240463256836, + "eval_reward": 0.8674011821113526, + "eval_reward_std": 0.22110519010493734, + "eval_rewards/accuracy_reward": 0.00065, + "eval_rewards/cosine_scaled_reward": -0.04208212785659125, + "eval_rewards/format_reward": 0.0, + "eval_rewards/reasoning_steps_reward": 0.908833336481452, + "eval_runtime": 22250.8735, + "eval_samples_per_second": 0.225, + "eval_steps_per_second": 0.056, + "step": 900 + }, + { + "clip_ratio": 0.0, + "completion_length": 31.6375, + "epoch": 0.9653333333333334, + "grad_norm": 10.456296920776367, + "kl": 12.725634765625, + "learning_rate": 5.2330996207010934e-08, + "loss": 0.9935, + "reward": 0.8905447907745838, + "reward_std": 0.19639883563213517, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03237185183097609, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9229166693985462, + "step": 905 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.18125, + "epoch": 0.9706666666666667, + "grad_norm": 7.654210567474365, + "kl": 14.7568359375, + "learning_rate": 4.327506025039785e-08, + "loss": 1.2517, + "reward": 0.8837279558181763, + "reward_std": 0.21061566043645144, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.04335535656136926, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9270833373069763, + "step": 910 + }, + { + "clip_ratio": 0.0, + "completion_length": 39.51875, + "epoch": 0.976, + "grad_norm": 11.826396942138672, + "kl": 15.272412109375, + "learning_rate": 3.5072084114107784e-08, + "loss": 1.2212, + "reward": 0.854040639102459, + "reward_std": 0.25280070770531893, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.048042675899341705, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9020833387970925, + "step": 915 + }, + { + "clip_ratio": 0.0, + "completion_length": 35.90625, + "epoch": 0.9813333333333333, + "grad_norm": 9.126459121704102, + "kl": 12.9994140625, + "learning_rate": 2.772491582110709e-08, + "loss": 1.0326, + "reward": 0.8983750879764557, + "reward_std": 0.17179049158003182, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.043291549726563974, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9416666701436043, + "step": 920 + }, + { + "clip_ratio": 0.0, + "completion_length": 34.6375, + "epoch": 0.9866666666666667, + "grad_norm": 7.235086441040039, + "kl": 13.3751953125, + "learning_rate": 2.1236106263132495e-08, + "loss": 1.0079, + "reward": 0.8850020661950111, + "reward_std": 0.20363401472568513, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03791458437335678, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9229166716337204, + "step": 925 + }, + { + "clip_ratio": 0.0, + "completion_length": 33.6125, + "epoch": 0.992, + "grad_norm": 6.8874053955078125, + "kl": 12.7154296875, + "learning_rate": 1.560790831503567e-08, + "loss": 0.9974, + "reward": 0.8779860392212868, + "reward_std": 0.21407040767371655, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.03243060409004102, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9104166701436043, + "step": 930 + }, + { + "clip_ratio": 0.0, + "completion_length": 27.25625, + "epoch": 0.9973333333333333, + "grad_norm": 7.043887138366699, + "kl": 11.3630859375, + "learning_rate": 1.0842276052599743e-08, + "loss": 0.8594, + "reward": 0.907631978392601, + "reward_std": 0.1686173222726211, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.025701325823320076, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.9333333373069763, + "step": 935 + }, + { + "clip_ratio": 0.0, + "completion_length": 37.796875, + "epoch": 0.9994666666666666, + "kl": 14.3427734375, + "reward": 0.8153905930766996, + "reward_std": 0.2130387331271777, + "rewards/accuracy_reward": 0.0, + "rewards/cosine_scaled_reward": -0.038776053593210236, + "rewards/format_reward": 0.0, + "rewards/reasoning_steps_reward": 0.8541666679084301, + "step": 937, + "total_flos": 0.0, + "train_loss": 0.7647797629284884, + "train_runtime": 290057.3567, + "train_samples_per_second": 0.026, + "train_steps_per_second": 0.003 + } + ], + "logging_steps": 5, + "max_steps": 937, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}