diff --git "a/checkpoint-400/trainer_state.json" "b/checkpoint-400/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-400/trainer_state.json" @@ -0,0 +1,10034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9070294784580499, + "eval_steps": 500, + "global_step": 400, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1626.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 762.0625, + "completions/mean_terminated_length": 762.0625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.0022675736961451248, + "grad_norm": 0.006846296135336161, + "kl": 0.0, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 225040.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.0346051454544067, + "rewards/thinking_verbosity_reward/std": 0.12779560685157776, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2746.0, + "completions/max_terminated_length": 2746.0, + "completions/mean_length": 829.37890625, + "completions/mean_terminated_length": 829.37890625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.0045351473922902496, + "grad_norm": 0.007818465121090412, + "kl": 0.000205961536266841, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 472097.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0030031204223633, + "rewards/thinking_verbosity_reward/std": 0.22048026323318481, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2132.0, + "completions/mean_length": 876.359375, + "completions/mean_terminated_length": 859.0708618164062, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "epoch": 0.006802721088435374, + "grad_norm": 0.006037730723619461, + "kl": -5.6747885537333786e-05, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 727229.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.0274438858032227, + "rewards/thinking_verbosity_reward/std": 0.14023631811141968, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2078.0, + "completions/max_terminated_length": 2078.0, + "completions/mean_length": 849.203125, + "completions/mean_terminated_length": 849.203125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.009070294784580499, + "grad_norm": 0.006025367416441441, + "kl": 0.00010214099893346429, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 978105.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.0261127948760986, + "rewards/thinking_verbosity_reward/std": 0.12497152388095856, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2811.0, + "completions/mean_length": 801.6328125, + "completions/mean_terminated_length": 792.7294311523438, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "epoch": 0.011337868480725623, + "grad_norm": 0.0076256911270320415, + "kl": 0.00024806000874377787, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 1213963.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -0.9818867444992065, + "rewards/thinking_verbosity_reward/std": 0.092582106590271, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1677.0, + "completions/mean_length": 781.8203125, + "completions/mean_terminated_length": 772.8392333984375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.013605442176870748, + "grad_norm": 0.008066726848483086, + "kl": 6.216949986992404e-05, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 1453613.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.038394808769226, + "rewards/thinking_verbosity_reward/std": 0.11809435486793518, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2167.0, + "completions/mean_length": 829.73046875, + "completions/mean_terminated_length": 820.9373168945312, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.015873015873015872, + "grad_norm": 0.007156250067055225, + "kl": -4.88694931846112e-05, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 1696584.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -0.9898421764373779, + "rewards/thinking_verbosity_reward/std": 0.11081196367740631, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1640.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 803.1484375, + "completions/mean_terminated_length": 803.1484375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.018140589569160998, + "grad_norm": 0.00690761674195528, + "kl": 1.194654032588005e-06, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 1938422.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -0.9544801712036133, + "rewards/thinking_verbosity_reward/std": 0.12256628274917603, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1926.0, + "completions/max_terminated_length": 1926.0, + "completions/mean_length": 800.2265625, + "completions/mean_terminated_length": 800.2265625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.02040816326530612, + "grad_norm": 0.008399578742682934, + "kl": 0.00015780693502165377, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 2174768.0, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48795005679130554, + "rewards/thinking_verbosity_reward/mean": -0.9820011854171753, + "rewards/thinking_verbosity_reward/std": 0.15239445865154266, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1697.0, + "completions/max_terminated_length": 1697.0, + "completions/mean_length": 783.609375, + "completions/mean_terminated_length": 783.609375, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.022675736961451247, + "grad_norm": 0.0075833238661289215, + "kl": 0.00030040129786357284, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 2410348.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -0.9807782173156738, + "rewards/thinking_verbosity_reward/std": 0.10758854448795319, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1759.0, + "completions/max_terminated_length": 1759.0, + "completions/mean_length": 809.765625, + "completions/mean_terminated_length": 809.765625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.024943310657596373, + "grad_norm": 0.00837654247879982, + "kl": -7.036331226117909e-05, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 2649656.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -0.980780839920044, + "rewards/thinking_verbosity_reward/std": 0.1276409924030304, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2120.0, + "completions/mean_length": 876.53125, + "completions/mean_terminated_length": 867.921630859375, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.027210884353741496, + "grad_norm": 0.005260106176137924, + "kl": -0.0001014049630612135, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 2908208.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -1.0201947689056396, + "rewards/thinking_verbosity_reward/std": 0.16129247844219208, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2722.0, + "completions/max_terminated_length": 2722.0, + "completions/mean_length": 822.765625, + "completions/mean_terminated_length": 822.765625, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "epoch": 0.02947845804988662, + "grad_norm": 0.00634964182972908, + "kl": 4.045761306770146e-05, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 3148052.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -0.9806254506111145, + "rewards/thinking_verbosity_reward/std": 0.08002175390720367, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2638.0, + "completions/mean_length": 827.2578125, + "completions/mean_terminated_length": 818.4549560546875, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.031746031746031744, + "grad_norm": 0.007651267573237419, + "kl": 0.00010258558904752135, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 3394158.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -1.0222628116607666, + "rewards/thinking_verbosity_reward/std": 0.1543615460395813, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1779.0, + "completions/max_terminated_length": 1779.0, + "completions/mean_length": 769.140625, + "completions/mean_terminated_length": 769.140625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.034013605442176874, + "grad_norm": 0.00896177627146244, + "kl": 0.00031310218037106097, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 3624530.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -0.9459785223007202, + "rewards/thinking_verbosity_reward/std": 0.12427378445863724, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2483.0, + "completions/max_terminated_length": 2483.0, + "completions/mean_length": 810.34765625, + "completions/mean_terminated_length": 810.34765625, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.036281179138321996, + "grad_norm": 0.00688561238348484, + "kl": 0.0004608951712725684, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 3864299.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.0219426155090332, + "rewards/thinking_verbosity_reward/std": 0.16693003475666046, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2255.0, + "completions/mean_length": 898.6171875, + "completions/mean_terminated_length": 890.0941772460938, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.03854875283446712, + "grad_norm": 0.006392187438905239, + "kl": 0.0004881410422967747, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 4129497.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.0157631635665894, + "rewards/thinking_verbosity_reward/std": 0.1741836965084076, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1916.0, + "completions/max_terminated_length": 1916.0, + "completions/mean_length": 754.09765625, + "completions/mean_terminated_length": 754.09765625, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "epoch": 0.04081632653061224, + "grad_norm": 0.008638097904622555, + "kl": 0.0003899891162291169, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 4352370.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.0124788284301758, + "rewards/thinking_verbosity_reward/std": 0.1419954001903534, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1543.0, + "completions/max_terminated_length": 1543.0, + "completions/mean_length": 791.80078125, + "completions/mean_terminated_length": 791.80078125, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "epoch": 0.04308390022675737, + "grad_norm": 0.00813338439911604, + "kl": 0.00013803227921016514, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 4584807.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.0131468772888184, + "rewards/thinking_verbosity_reward/std": 0.14779528975486755, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2304.0, + "completions/max_terminated_length": 2304.0, + "completions/mean_length": 884.6640625, + "completions/mean_terminated_length": 884.6640625, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.045351473922902494, + "grad_norm": 0.006490104366093874, + "kl": 0.0006329713214654475, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 4843881.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0559020042419434, + "rewards/thinking_verbosity_reward/std": 0.15619657933712006, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1922.0, + "completions/mean_length": 872.82421875, + "completions/mean_terminated_length": 855.5078735351562, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "epoch": 0.047619047619047616, + "grad_norm": 0.006365430075675249, + "kl": 0.0008684554268256761, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 5098828.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -0.9918133020401001, + "rewards/thinking_verbosity_reward/std": 0.11725091189146042, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2840.0, + "completions/max_terminated_length": 2840.0, + "completions/mean_length": 927.71875, + "completions/mean_terminated_length": 927.71875, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.049886621315192746, + "grad_norm": 0.005739539861679077, + "kl": 0.0008349915151484311, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 5375012.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0158661603927612, + "rewards/thinking_verbosity_reward/std": 0.102393738925457, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2343.0, + "completions/max_terminated_length": 2343.0, + "completions/mean_length": 846.17578125, + "completions/mean_terminated_length": 846.17578125, + "completions/min_length": 295.0, + "completions/min_terminated_length": 295.0, + "epoch": 0.05215419501133787, + "grad_norm": 0.006932960823178291, + "kl": 0.000741164680221118, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 5625841.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -0.9843477010726929, + "rewards/thinking_verbosity_reward/std": 0.11736638098955154, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2499.0, + "completions/mean_length": 810.15234375, + "completions/mean_terminated_length": 792.342529296875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.05442176870748299, + "grad_norm": 0.007362490054219961, + "kl": 0.0010543569078436121, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 5867200.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -0.9852904081344604, + "rewards/thinking_verbosity_reward/std": 0.2020326405763626, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2391.0, + "completions/max_terminated_length": 2391.0, + "completions/mean_length": 888.44921875, + "completions/mean_terminated_length": 888.44921875, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 0.05668934240362812, + "grad_norm": 0.00539548322558403, + "kl": 0.0013065554085187614, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 6128835.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -1.020519495010376, + "rewards/thinking_verbosity_reward/std": 0.14483477175235748, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2036.0, + "completions/mean_length": 873.74609375, + "completions/mean_terminated_length": 865.1255493164062, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.05895691609977324, + "grad_norm": 0.007669294252991676, + "kl": 0.0010627802548697218, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 6384962.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -0.9768633842468262, + "rewards/thinking_verbosity_reward/std": 0.1765960454940796, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2939.0, + "completions/max_terminated_length": 2939.0, + "completions/mean_length": 821.640625, + "completions/mean_terminated_length": 821.640625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.061224489795918366, + "grad_norm": 0.008534739725291729, + "kl": 0.00196480295562651, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 6626022.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -0.9709947109222412, + "rewards/thinking_verbosity_reward/std": 0.11590465158224106, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2360.0, + "completions/max_terminated_length": 2360.0, + "completions/mean_length": 797.3671875, + "completions/mean_terminated_length": 797.3671875, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "epoch": 0.06349206349206349, + "grad_norm": 0.008207069709897041, + "kl": 0.0016554918547626585, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 6861452.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -0.9580620527267456, + "rewards/thinking_verbosity_reward/std": 0.18858303129673004, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2336.0, + "completions/max_terminated_length": 2336.0, + "completions/mean_length": 790.08203125, + "completions/mean_terminated_length": 790.08203125, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.06575963718820861, + "grad_norm": 0.007825409062206745, + "kl": 0.002067265333607793, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 7098281.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.0324976444244385, + "rewards/thinking_verbosity_reward/std": 0.18105538189411163, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2775.0, + "completions/max_terminated_length": 2775.0, + "completions/mean_length": 848.1484375, + "completions/mean_terminated_length": 848.1484375, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.06802721088435375, + "grad_norm": 0.0075764125213027, + "kl": 0.0024665833334438503, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 7347119.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.0623955726623535, + "rewards/thinking_verbosity_reward/std": 0.19967037439346313, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2429.0, + "completions/mean_length": 865.58984375, + "completions/mean_terminated_length": 848.216552734375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.07029478458049887, + "grad_norm": 0.009601961821317673, + "kl": 0.0019374630646780133, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 7604478.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.0530279874801636, + "rewards/thinking_verbosity_reward/std": 0.1941586583852768, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2176.0, + "completions/max_terminated_length": 2176.0, + "completions/mean_length": 905.2421875, + "completions/mean_terminated_length": 905.2421875, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 0.07256235827664399, + "grad_norm": 0.006731851492077112, + "kl": 0.0030736134795006365, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 7871228.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.014988660812378, + "rewards/thinking_verbosity_reward/std": 0.17238226532936096, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2620.0, + "completions/mean_length": 948.01953125, + "completions/mean_terminated_length": 933.5393676757812, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.07482993197278912, + "grad_norm": 0.006155460141599178, + "kl": 0.00322791762300767, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 8145105.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -1.027083158493042, + "rewards/thinking_verbosity_reward/std": 0.1459188312292099, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 2963.0, + "completions/max_terminated_length": 2663.0, + "completions/mean_length": 842.22265625, + "completions/mean_terminated_length": 833.9059448242188, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.07709750566893424, + "grad_norm": 0.007661478593945503, + "kl": 0.003542743594152853, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 8391586.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0041967630386353, + "rewards/thinking_verbosity_reward/std": 0.15114328265190125, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2931.0, + "completions/mean_length": 877.49609375, + "completions/mean_terminated_length": 868.8902587890625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.07936507936507936, + "grad_norm": 0.006931083742529154, + "kl": 0.0036731576547026634, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 8649457.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -0.9805034399032593, + "rewards/thinking_verbosity_reward/std": 0.11658073961734772, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2034.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 851.9921875, + "completions/mean_terminated_length": 851.9921875, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 0.08163265306122448, + "grad_norm": 0.006668751128017902, + "kl": 0.005017826275434345, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 8898335.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -0.9645542502403259, + "rewards/thinking_verbosity_reward/std": 0.1490258425474167, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1928.0, + "completions/max_terminated_length": 1928.0, + "completions/mean_length": 890.23828125, + "completions/mean_terminated_length": 890.23828125, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "epoch": 0.08390022675736962, + "grad_norm": 0.00636932672932744, + "kl": 0.004095446696737781, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 9160780.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.0703754425048828, + "rewards/thinking_verbosity_reward/std": 0.149069145321846, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3060.0, + "completions/max_terminated_length": 3060.0, + "completions/mean_length": 944.84765625, + "completions/mean_terminated_length": 936.8275146484375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.08616780045351474, + "grad_norm": 0.007000387646257877, + "kl": 0.005410628335084766, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 9434821.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.043333649635315, + "rewards/thinking_verbosity_reward/std": 0.1250786930322647, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2200.0, + "completions/mean_length": 891.70703125, + "completions/mean_terminated_length": 883.1569213867188, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.08843537414965986, + "grad_norm": 0.006914109457284212, + "kl": 0.005650700011756271, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 9698842.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.000028133392334, + "rewards/thinking_verbosity_reward/std": 0.14994162321090698, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3055.0, + "completions/mean_length": 916.61328125, + "completions/mean_terminated_length": 873.6773071289062, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.09070294784580499, + "grad_norm": 0.009213740937411785, + "kl": 0.006022902438417077, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 9963207.0, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.4604927599430084, + "rewards/thinking_verbosity_reward/mean": -1.030358076095581, + "rewards/thinking_verbosity_reward/std": 0.162821963429451, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2393.0, + "completions/mean_length": 931.30078125, + "completions/mean_terminated_length": 922.9059448242188, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "epoch": 0.09297052154195011, + "grad_norm": 0.007363015785813332, + "kl": 0.006109016656409949, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 10235780.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -1.1093066930770874, + "rewards/thinking_verbosity_reward/std": 0.17979653179645538, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2075.0, + "completions/mean_length": 812.515625, + "completions/mean_terminated_length": 803.6549682617188, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.09523809523809523, + "grad_norm": 0.008939437568187714, + "kl": 0.008389145601540804, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 10475024.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9767196178436279, + "rewards/thinking_verbosity_reward/std": 0.14639614522457123, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2830.0, + "completions/mean_length": 1004.48828125, + "completions/mean_terminated_length": 971.6707153320312, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.09750566893424037, + "grad_norm": 0.006452460307627916, + "kl": 0.005963592091575265, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 10764629.0, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17536810040473938, + "rewards/thinking_verbosity_reward/mean": -1.0999956130981445, + "rewards/thinking_verbosity_reward/std": 0.22845692932605743, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2977.0, + "completions/mean_length": 983.109375, + "completions/mean_terminated_length": 941.498046875, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.09977324263038549, + "grad_norm": 0.006443258374929428, + "kl": 0.006700891710352153, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 11048705.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -1.1343746185302734, + "rewards/thinking_verbosity_reward/std": 0.18737705051898956, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3060.0, + "completions/max_terminated_length": 3060.0, + "completions/mean_length": 849.5703125, + "completions/mean_terminated_length": 849.5703125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.10204081632653061, + "grad_norm": 0.0067588756792247295, + "kl": 0.006657186604570597, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 11298635.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -0.9997424483299255, + "rewards/thinking_verbosity_reward/std": 0.1527765840291977, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2589.0, + "completions/mean_length": 835.453125, + "completions/mean_terminated_length": 826.682373046875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.10430839002267574, + "grad_norm": 0.00891664158552885, + "kl": 0.009914702794048935, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 11543655.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -0.9916813969612122, + "rewards/thinking_verbosity_reward/std": 0.1497543305158615, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2774.0, + "completions/mean_length": 902.14453125, + "completions/mean_terminated_length": 893.6353759765625, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.10657596371882086, + "grad_norm": 0.00908136460930109, + "kl": 0.007933848013635725, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 11805636.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.155268669128418, + "rewards/thinking_verbosity_reward/std": 0.3218294084072113, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2950.0, + "completions/mean_length": 857.39453125, + "completions/mean_terminated_length": 848.7098388671875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.10884353741496598, + "grad_norm": 0.007951783947646618, + "kl": 0.009120354079641402, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 12059521.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -1.0336220264434814, + "rewards/thinking_verbosity_reward/std": 0.13297787308692932, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2500.0, + "completions/max_terminated_length": 2500.0, + "completions/mean_length": 906.10546875, + "completions/mean_terminated_length": 906.10546875, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "epoch": 0.1111111111111111, + "grad_norm": 0.008099460043013096, + "kl": 0.008504969417117536, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 12321284.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -0.9789541363716125, + "rewards/thinking_verbosity_reward/std": 0.12617865204811096, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3067.0, + "completions/mean_length": 849.5859375, + "completions/mean_terminated_length": 823.2332153320312, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.11337868480725624, + "grad_norm": 0.006570389028638601, + "kl": 0.009818906255532056, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 12571434.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -0.9381024241447449, + "rewards/thinking_verbosity_reward/std": 0.11786402016878128, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2927.0, + "completions/mean_length": 828.61328125, + "completions/mean_terminated_length": 810.9487915039062, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.11564625850340136, + "grad_norm": 0.00906436424702406, + "kl": 0.010079486761242151, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 12814831.0, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "rewards/thinking_verbosity_reward/mean": -0.9193981289863586, + "rewards/thinking_verbosity_reward/std": 0.15666411817073822, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2339.0, + "completions/mean_length": 905.73046875, + "completions/mean_terminated_length": 888.6732177734375, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.11791383219954649, + "grad_norm": 0.009035835042595863, + "kl": 0.008975302102044225, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 13078226.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -1.0100451707839966, + "rewards/thinking_verbosity_reward/std": 0.17275826632976532, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 911.98828125, + "completions/mean_terminated_length": 886.3755493164062, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.12018140589569161, + "grad_norm": 0.007157270330935717, + "kl": 0.008368385955691338, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 13341199.0, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17536810040473938, + "rewards/thinking_verbosity_reward/mean": -1.0113205909729004, + "rewards/thinking_verbosity_reward/std": 0.18431605398654938, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2299.0, + "completions/mean_length": 910.03515625, + "completions/mean_terminated_length": 893.0117797851562, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.12244897959183673, + "grad_norm": 0.006608675234019756, + "kl": 0.010004184092395008, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 13610336.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -1.1097066402435303, + "rewards/thinking_verbosity_reward/std": 0.1976967453956604, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2420.0, + "completions/mean_length": 941.30859375, + "completions/mean_terminated_length": 916.0435180664062, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.12471655328798185, + "grad_norm": 0.00935720931738615, + "kl": 0.009216500504408032, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 13883135.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.0746923685073853, + "rewards/thinking_verbosity_reward/std": 0.2892882823944092, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2294.0, + "completions/mean_length": 920.49609375, + "completions/mean_terminated_length": 903.5551147460938, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.12698412698412698, + "grad_norm": 0.00840294174849987, + "kl": 0.00920592702459544, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 14150286.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.1645448207855225, + "rewards/thinking_verbosity_reward/std": 0.1768539398908615, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2761.0, + "completions/mean_length": 864.640625, + "completions/mean_terminated_length": 847.2598266601562, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.1292517006802721, + "grad_norm": 0.009047860279679298, + "kl": 0.009754394181072712, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 14403314.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.0208035707473755, + "rewards/thinking_verbosity_reward/std": 0.1702914983034134, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2394.0, + "completions/max_terminated_length": 2394.0, + "completions/mean_length": 851.94921875, + "completions/mean_terminated_length": 851.94921875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.13151927437641722, + "grad_norm": 0.008113497868180275, + "kl": 0.01064593461342156, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 14651781.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -1.0482971668243408, + "rewards/thinking_verbosity_reward/std": 0.1441270262002945, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2998.0, + "completions/mean_length": 958.546875, + "completions/mean_terminated_length": 916.4462280273438, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.13378684807256236, + "grad_norm": 0.006745356135070324, + "kl": 0.010976960067637265, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 14934425.0, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17536810040473938, + "rewards/thinking_verbosity_reward/mean": -1.054985761642456, + "rewards/thinking_verbosity_reward/std": 0.312842458486557, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2402.0, + "completions/mean_length": 890.8203125, + "completions/mean_terminated_length": 882.2667236328125, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.1360544217687075, + "grad_norm": 0.007630802225321531, + "kl": 0.010621859692037106, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 15193083.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -0.9835863709449768, + "rewards/thinking_verbosity_reward/std": 0.16115634143352509, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2950.0, + "completions/mean_length": 879.50390625, + "completions/mean_terminated_length": 862.2401733398438, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.1383219954648526, + "grad_norm": 0.010134682059288025, + "kl": 0.0115025385748595, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 15449332.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9814037680625916, + "rewards/thinking_verbosity_reward/std": 0.20800180733203888, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2630.0, + "completions/mean_length": 880.2890625, + "completions/mean_terminated_length": 854.3004150390625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.14058956916099774, + "grad_norm": 0.007968141697347164, + "kl": 0.009277043398469687, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 15709606.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -1.0375423431396484, + "rewards/thinking_verbosity_reward/std": 0.15858420729637146, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2967.0, + "completions/mean_length": 835.12109375, + "completions/mean_terminated_length": 817.5078735351562, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.14285714285714285, + "grad_norm": 0.008828379213809967, + "kl": 0.013155567343346775, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 15957597.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.0363775491714478, + "rewards/thinking_verbosity_reward/std": 0.16792258620262146, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2411.0, + "completions/mean_length": 889.05078125, + "completions/mean_terminated_length": 854.4008178710938, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.14512471655328799, + "grad_norm": 0.008275152184069157, + "kl": 0.012239311239682138, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 16217514.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.0302348136901855, + "rewards/thinking_verbosity_reward/std": 0.15868715941905975, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2801.0, + "completions/mean_length": 850.17578125, + "completions/mean_terminated_length": 832.6810913085938, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.1473922902494331, + "grad_norm": 0.008566019125282764, + "kl": 0.010176013456657529, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 16465591.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.0878468751907349, + "rewards/thinking_verbosity_reward/std": 0.24354270100593567, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2797.0, + "completions/max_terminated_length": 2797.0, + "completions/mean_length": 903.0234375, + "completions/mean_terminated_length": 903.0234375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.14965986394557823, + "grad_norm": 0.008970546536147594, + "kl": 0.012535563902929425, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 16730221.0, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.4604927599430084, + "rewards/thinking_verbosity_reward/mean": -0.9271892309188843, + "rewards/thinking_verbosity_reward/std": 0.17539002001285553, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2916.0, + "completions/max_terminated_length": 2916.0, + "completions/mean_length": 819.17578125, + "completions/mean_terminated_length": 819.17578125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.15192743764172337, + "grad_norm": 0.007922605611383915, + "kl": 0.011832835793029517, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 16973490.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.0275592803955078, + "rewards/thinking_verbosity_reward/std": 0.17587842047214508, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2508.0, + "completions/mean_length": 897.40625, + "completions/mean_terminated_length": 854.087646484375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.15419501133786848, + "grad_norm": 0.010056265629827976, + "kl": 0.010913274949416518, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 17236066.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9707518815994263, + "rewards/thinking_verbosity_reward/std": 0.16968779265880585, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2921.0, + "completions/mean_length": 910.80078125, + "completions/mean_terminated_length": 893.783447265625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.1564625850340136, + "grad_norm": 0.008754831738770008, + "kl": 0.011136114713735878, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 17500287.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.0433489084243774, + "rewards/thinking_verbosity_reward/std": 0.206964910030365, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2663.0, + "completions/mean_length": 851.38671875, + "completions/mean_terminated_length": 833.9015502929688, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.15873015873015872, + "grad_norm": 0.008038002997636795, + "kl": 0.010170459048822522, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 17750082.0, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "rewards/thinking_verbosity_reward/mean": -0.8939105868339539, + "rewards/thinking_verbosity_reward/std": 0.14066872000694275, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2918.0, + "completions/max_terminated_length": 2918.0, + "completions/mean_length": 881.68359375, + "completions/mean_terminated_length": 881.68359375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.16099773242630386, + "grad_norm": 0.008496596477925777, + "kl": 0.012851954088546336, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 18008241.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -1.0300571918487549, + "rewards/thinking_verbosity_reward/std": 0.14637769758701324, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3028.0, + "completions/mean_length": 919.2890625, + "completions/mean_terminated_length": 910.8471069335938, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.16326530612244897, + "grad_norm": 0.008254647254943848, + "kl": 0.01116384391207248, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 18277195.0, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.501733124256134, + "rewards/thinking_verbosity_reward/mean": -1.0381544828414917, + "rewards/thinking_verbosity_reward/std": 0.20433998107910156, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2372.0, + "completions/mean_length": 852.96484375, + "completions/mean_terminated_length": 835.4921264648438, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.1655328798185941, + "grad_norm": 0.005581771954894066, + "kl": 0.01010585529729724, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 18531074.0, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.125, + "rewards/thinking_verbosity_reward/mean": -1.0642929077148438, + "rewards/thinking_verbosity_reward/std": 0.1985449641942978, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2385.0, + "completions/mean_length": 869.515625, + "completions/mean_terminated_length": 852.1732177734375, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.16780045351473924, + "grad_norm": 0.008788047358393669, + "kl": 0.011326493811793625, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 18788710.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.0177788734436035, + "rewards/thinking_verbosity_reward/std": 0.17464584112167358, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2087.0, + "completions/max_terminated_length": 2087.0, + "completions/mean_length": 874.71484375, + "completions/mean_terminated_length": 874.71484375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.17006802721088435, + "grad_norm": 0.007407894358038902, + "kl": 0.007880535093136132, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 19046661.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -1.0330013036727905, + "rewards/thinking_verbosity_reward/std": 0.21459190547466278, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2378.0, + "completions/mean_length": 853.3203125, + "completions/mean_terminated_length": 818.1032104492188, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 0.17233560090702948, + "grad_norm": 0.0077790855430066586, + "kl": 0.01125244895229116, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 19295935.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -1.030945897102356, + "rewards/thinking_verbosity_reward/std": 0.15482372045516968, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1903.0, + "completions/mean_length": 890.66796875, + "completions/mean_terminated_length": 873.4921264648438, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.1746031746031746, + "grad_norm": 0.0075124152936041355, + "kl": 0.008916582562960684, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 19560954.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0838379859924316, + "rewards/thinking_verbosity_reward/std": 0.22865712642669678, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1680.0, + "completions/mean_length": 820.52734375, + "completions/mean_terminated_length": 802.7991943359375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.17687074829931973, + "grad_norm": 0.009709039703011513, + "kl": 0.011968876933678985, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 19803321.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -0.9957240223884583, + "rewards/thinking_verbosity_reward/std": 0.14698883891105652, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2181.0, + "completions/mean_length": 821.203125, + "completions/mean_terminated_length": 803.4802856445312, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "epoch": 0.17913832199546487, + "grad_norm": 0.009636872448027134, + "kl": 0.011313010472804308, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 20045637.0, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "rewards/thinking_verbosity_reward/mean": -1.0173425674438477, + "rewards/thinking_verbosity_reward/std": 0.21815963089466095, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2627.0, + "completions/mean_length": 844.0, + "completions/mean_terminated_length": 835.2628173828125, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.18140589569160998, + "grad_norm": 0.008047728799283504, + "kl": 0.01244122232310474, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 20293405.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -0.9766184687614441, + "rewards/thinking_verbosity_reward/std": 0.13644269108772278, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3043.0, + "completions/mean_length": 974.50390625, + "completions/mean_terminated_length": 932.7211303710938, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.1836734693877551, + "grad_norm": 0.008809532970190048, + "kl": 0.010698359226807952, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 20573358.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -1.1138408184051514, + "rewards/thinking_verbosity_reward/std": 0.2117185741662979, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2555.0, + "completions/mean_length": 955.546875, + "completions/mean_terminated_length": 921.9524536132812, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.18594104308390022, + "grad_norm": 0.008250582963228226, + "kl": 0.00942709285300225, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 20851578.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -1.0413832664489746, + "rewards/thinking_verbosity_reward/std": 0.2592982351779938, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3007.0, + "completions/mean_length": 907.62109375, + "completions/mean_terminated_length": 890.5787353515625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.18820861678004536, + "grad_norm": 0.007970296777784824, + "kl": 0.010147892055101693, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 21117649.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.0679872035980225, + "rewards/thinking_verbosity_reward/std": 0.11830828338861465, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2962.0, + "completions/mean_length": 929.6171875, + "completions/mean_terminated_length": 921.2157592773438, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.19047619047619047, + "grad_norm": 0.008969978429377079, + "kl": 0.01110009045805782, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 21386239.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0927581787109375, + "rewards/thinking_verbosity_reward/std": 0.19533401727676392, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2217.0, + "completions/mean_length": 941.21484375, + "completions/mean_terminated_length": 932.85888671875, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.1927437641723356, + "grad_norm": 0.007240003440529108, + "kl": 0.009909643791615963, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 21662222.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -0.9847238063812256, + "rewards/thinking_verbosity_reward/std": 0.14600031077861786, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2155.0, + "completions/mean_length": 983.05859375, + "completions/mean_terminated_length": 966.6102294921875, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.19501133786848074, + "grad_norm": 0.006852799095213413, + "kl": 0.009759980021044612, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 21947565.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.1069188117980957, + "rewards/thinking_verbosity_reward/std": 0.15674066543579102, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1961.0, + "completions/mean_length": 844.203125, + "completions/mean_terminated_length": 817.78662109375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 0.19727891156462585, + "grad_norm": 0.007779599167406559, + "kl": 0.010700178449042141, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 22196393.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -1.0626180171966553, + "rewards/thinking_verbosity_reward/std": 0.11761243641376495, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 889.59375, + "completions/mean_terminated_length": 872.409423828125, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.19954648526077098, + "grad_norm": 0.008237868547439575, + "kl": 0.009312348905950785, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 22457593.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.0276521444320679, + "rewards/thinking_verbosity_reward/std": 0.21451638638973236, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2486.0, + "completions/max_terminated_length": 2486.0, + "completions/mean_length": 909.5703125, + "completions/mean_terminated_length": 909.5703125, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.2018140589569161, + "grad_norm": 0.0089192483574152, + "kl": 0.01075939426664263, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 22722451.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.1140661239624023, + "rewards/thinking_verbosity_reward/std": 0.16711047291755676, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2503.0, + "completions/mean_length": 872.30078125, + "completions/mean_terminated_length": 863.674560546875, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.20408163265306123, + "grad_norm": 0.010932698845863342, + "kl": 0.010983020882122219, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 22976112.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -1.0482385158538818, + "rewards/thinking_verbosity_reward/std": 0.18437141180038452, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2476.0, + "completions/mean_length": 849.87109375, + "completions/mean_terminated_length": 823.5217895507812, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.20634920634920634, + "grad_norm": 0.007904685102403164, + "kl": 0.012142079416662455, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 23227431.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0475993156433105, + "rewards/thinking_verbosity_reward/std": 0.14112865924835205, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2542.0, + "completions/mean_length": 890.86328125, + "completions/mean_terminated_length": 882.3098754882812, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.20861678004535147, + "grad_norm": 0.008765889331698418, + "kl": 0.009628154919482768, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 23488140.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -0.9875479340553284, + "rewards/thinking_verbosity_reward/std": 0.18977628648281097, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2552.0, + "completions/mean_length": 919.078125, + "completions/mean_terminated_length": 902.1259765625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.2108843537414966, + "grad_norm": 0.008633551187813282, + "kl": 0.011130126425996423, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 23755136.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.0010552406311035, + "rewards/thinking_verbosity_reward/std": 0.1642613261938095, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1618.0, + "completions/mean_length": 857.76171875, + "completions/mean_terminated_length": 840.3267822265625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.21315192743764172, + "grad_norm": 0.006649704184383154, + "kl": 0.009582348458934575, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 24005411.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.0377217531204224, + "rewards/thinking_verbosity_reward/std": 0.15964001417160034, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2820.0, + "completions/mean_length": 890.55859375, + "completions/mean_terminated_length": 882.0039672851562, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.21541950113378686, + "grad_norm": 0.009338507428765297, + "kl": 0.01146495551802218, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 24267402.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.065229892730713, + "rewards/thinking_verbosity_reward/std": 0.27791082859039307, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2727.0, + "completions/mean_length": 851.70703125, + "completions/mean_terminated_length": 816.46435546875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.21768707482993196, + "grad_norm": 0.011838492006063461, + "kl": 0.010163495084270835, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 24518575.0, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4787135720252991, + "rewards/thinking_verbosity_reward/mean": -1.0004703998565674, + "rewards/thinking_verbosity_reward/std": 0.20691539347171783, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2909.0, + "completions/mean_length": 858.62109375, + "completions/mean_terminated_length": 823.4881591796875, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.2199546485260771, + "grad_norm": 0.009089348837733269, + "kl": 0.01091561233624816, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 24773862.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.0555992126464844, + "rewards/thinking_verbosity_reward/std": 0.1630784422159195, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2987.0, + "completions/mean_length": 921.55078125, + "completions/mean_terminated_length": 843.1943359375, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.2222222222222222, + "grad_norm": 0.009785081259906292, + "kl": 0.012216721661388874, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 25042371.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.0962475538253784, + "rewards/thinking_verbosity_reward/std": 0.3061525225639343, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2669.0, + "completions/mean_length": 917.22265625, + "completions/mean_terminated_length": 900.2559204101562, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.22448979591836735, + "grad_norm": 0.008996937423944473, + "kl": 0.008763862191699445, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 25310228.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0787091255187988, + "rewards/thinking_verbosity_reward/std": 0.2066836804151535, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2949.0, + "completions/mean_length": 814.37109375, + "completions/mean_terminated_length": 805.5177001953125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.22675736961451248, + "grad_norm": 0.008593808859586716, + "kl": 0.013185960589908063, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 25548659.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.0029938220977783, + "rewards/thinking_verbosity_reward/std": 0.2186257541179657, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2883.0, + "completions/mean_length": 865.9453125, + "completions/mean_terminated_length": 857.294189453125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.2290249433106576, + "grad_norm": 0.009650018997490406, + "kl": 0.011625583516433835, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 25805709.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.0243103504180908, + "rewards/thinking_verbosity_reward/std": 0.12185460329055786, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2398.0, + "completions/mean_length": 864.71484375, + "completions/mean_terminated_length": 829.6785888671875, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.23129251700680273, + "grad_norm": 0.008408834226429462, + "kl": 0.012548828963190317, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 26058804.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -0.9666011333465576, + "rewards/thinking_verbosity_reward/std": 0.2213297188282013, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2131.0, + "completions/mean_length": 944.86328125, + "completions/mean_terminated_length": 936.5216064453125, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.23356009070294784, + "grad_norm": 0.010196613147854805, + "kl": 0.010808886028826237, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 26333361.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.0968388319015503, + "rewards/thinking_verbosity_reward/std": 0.1619306057691574, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2296.0, + "completions/mean_length": 868.296875, + "completions/mean_terminated_length": 859.6549682617188, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.23582766439909297, + "grad_norm": 0.009309634566307068, + "kl": 0.011414475389756262, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 26588229.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.0811493396759033, + "rewards/thinking_verbosity_reward/std": 0.24483317136764526, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1762.0, + "completions/max_terminated_length": 1762.0, + "completions/mean_length": 813.72265625, + "completions/mean_terminated_length": 813.72265625, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.23809523809523808, + "grad_norm": 0.00860657263547182, + "kl": 0.01347862952388823, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 26831462.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -1.003593921661377, + "rewards/thinking_verbosity_reward/std": 0.18947094678878784, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2591.0, + "completions/mean_length": 887.32421875, + "completions/mean_terminated_length": 843.8048095703125, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.24036281179138322, + "grad_norm": 0.008666235953569412, + "kl": 0.012982807587832212, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 27089849.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.0512151718139648, + "rewards/thinking_verbosity_reward/std": 0.22604534029960632, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2507.0, + "completions/mean_length": 854.6875, + "completions/mean_terminated_length": 845.9921875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "epoch": 0.24263038548752835, + "grad_norm": 0.010318150743842125, + "kl": 0.013109323685057461, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 27344041.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -0.9696528315544128, + "rewards/thinking_verbosity_reward/std": 0.1526484489440918, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2407.0, + "completions/mean_length": 840.44921875, + "completions/mean_terminated_length": 831.6980590820312, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.24489795918367346, + "grad_norm": 0.009137955494225025, + "kl": 0.016236132942140102, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 27590388.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.05830979347229, + "rewards/thinking_verbosity_reward/std": 0.1881929486989975, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3042.0, + "completions/mean_length": 918.33984375, + "completions/mean_terminated_length": 892.8024291992188, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.2471655328798186, + "grad_norm": 0.010082253254950047, + "kl": 0.012006666045635939, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 27858491.0, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4787135720252991, + "rewards/thinking_verbosity_reward/mean": -1.0163030624389648, + "rewards/thinking_verbosity_reward/std": 0.140503391623497, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2945.0, + "completions/mean_length": 851.7734375, + "completions/mean_terminated_length": 816.5317993164062, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.2494331065759637, + "grad_norm": 0.011952441185712814, + "kl": 0.012579790083691478, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 28107825.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.0410233736038208, + "rewards/thinking_verbosity_reward/std": 0.26076218485832214, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2924.0, + "completions/max_terminated_length": 2924.0, + "completions/mean_length": 926.53125, + "completions/mean_terminated_length": 926.53125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.25170068027210885, + "grad_norm": 0.008329563774168491, + "kl": 0.010093003744259477, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 28377033.0, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.125, + "rewards/thinking_verbosity_reward/mean": -1.0671601295471191, + "rewards/thinking_verbosity_reward/std": 0.1745661050081253, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2950.0, + "completions/mean_length": 868.2578125, + "completions/mean_terminated_length": 859.61572265625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.25396825396825395, + "grad_norm": 0.010890102945268154, + "kl": 0.013749998295679688, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 28632667.0, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "rewards/thinking_verbosity_reward/mean": -0.9976963400840759, + "rewards/thinking_verbosity_reward/std": 0.11069928109645844, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2782.0, + "completions/mean_length": 945.9609375, + "completions/mean_terminated_length": 929.220458984375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.2562358276643991, + "grad_norm": 0.008435415104031563, + "kl": 0.011849641450680792, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 28907873.0, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.4604927599430084, + "rewards/thinking_verbosity_reward/mean": -1.0127184391021729, + "rewards/thinking_verbosity_reward/std": 0.16555339097976685, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2759.0, + "completions/mean_length": 918.796875, + "completions/mean_terminated_length": 901.842529296875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.2585034013605442, + "grad_norm": 0.009460230357944965, + "kl": 0.012644457630813122, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 29173461.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -0.9719576239585876, + "rewards/thinking_verbosity_reward/std": 0.11471861600875854, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2641.0, + "completions/mean_length": 939.60546875, + "completions/mean_terminated_length": 931.2431640625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.26077097505668934, + "grad_norm": 0.008952354080975056, + "kl": 0.01107516314368695, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 29448088.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.090989589691162, + "rewards/thinking_verbosity_reward/std": 0.20455002784729004, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2620.0, + "completions/mean_length": 914.5546875, + "completions/mean_terminated_length": 844.9596557617188, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.26303854875283444, + "grad_norm": 0.009088842198252678, + "kl": 0.013067572028376162, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 29712398.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.0013940334320068, + "rewards/thinking_verbosity_reward/std": 0.13177350163459778, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2587.0, + "completions/mean_length": 875.390625, + "completions/mean_terminated_length": 858.094482421875, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.2653061224489796, + "grad_norm": 0.009793553501367569, + "kl": 0.01489451399538666, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 29973290.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.1064176559448242, + "rewards/thinking_verbosity_reward/std": 0.14793646335601807, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3041.0, + "completions/mean_length": 886.19921875, + "completions/mean_terminated_length": 842.6574096679688, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.2675736961451247, + "grad_norm": 0.011109042912721634, + "kl": 0.01454357139300555, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 30232997.0, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "rewards/thinking_verbosity_reward/mean": -1.0531014204025269, + "rewards/thinking_verbosity_reward/std": 0.2871605157852173, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2854.0, + "completions/mean_length": 866.57421875, + "completions/mean_terminated_length": 831.5675048828125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.2698412698412698, + "grad_norm": 0.008669278584420681, + "kl": 0.015019292011857033, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 30489568.0, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4836103618144989, + "rewards/thinking_verbosity_reward/mean": -0.9399356245994568, + "rewards/thinking_verbosity_reward/std": 0.14253069460391998, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2513.0, + "completions/mean_length": 836.88671875, + "completions/mean_terminated_length": 828.1216430664062, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.272108843537415, + "grad_norm": 0.010173976421356201, + "kl": 0.013380541233345866, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 30739955.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0224072933197021, + "rewards/thinking_verbosity_reward/std": 0.22703179717063904, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2902.0, + "completions/mean_length": 925.88671875, + "completions/mean_terminated_length": 877.26806640625, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.2743764172335601, + "grad_norm": 0.00992068462073803, + "kl": 0.013840829604305327, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 31007398.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -1.0768048763275146, + "rewards/thinking_verbosity_reward/std": 0.27792978286743164, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2252.0, + "completions/mean_length": 818.8125, + "completions/mean_terminated_length": 792.0949096679688, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.2766439909297052, + "grad_norm": 0.006118103861808777, + "kl": 0.014967744937166572, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 31247142.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -0.9979598522186279, + "rewards/thinking_verbosity_reward/std": 0.14602535963058472, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1713.0, + "completions/mean_length": 789.0234375, + "completions/mean_terminated_length": 743.5458374023438, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.2789115646258503, + "grad_norm": 0.010149531997740269, + "kl": 0.01555039978120476, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 31479020.0, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "rewards/thinking_verbosity_reward/mean": -1.0046645402908325, + "rewards/thinking_verbosity_reward/std": 0.2573230564594269, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2861.0, + "completions/mean_length": 888.98828125, + "completions/mean_terminated_length": 845.5020141601562, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.2811791383219955, + "grad_norm": 0.007618566509336233, + "kl": 0.014443471562117338, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 31740697.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.016685962677002, + "rewards/thinking_verbosity_reward/std": 0.17989718914031982, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2648.0, + "completions/max_terminated_length": 2648.0, + "completions/mean_length": 910.49609375, + "completions/mean_terminated_length": 910.49609375, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "epoch": 0.2834467120181406, + "grad_norm": 0.007611451670527458, + "kl": 0.01170169620309025, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 32009104.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -1.0677266120910645, + "rewards/thinking_verbosity_reward/std": 0.236875519156456, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2159.0, + "completions/mean_length": 925.77734375, + "completions/mean_terminated_length": 908.8779296875, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.2857142857142857, + "grad_norm": 0.0084237614646554, + "kl": 0.013823596760630608, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 32277935.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9637631177902222, + "rewards/thinking_verbosity_reward/std": 0.2284972220659256, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3039.0, + "completions/mean_length": 841.3046875, + "completions/mean_terminated_length": 796.8685302734375, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.28798185941043086, + "grad_norm": 0.008471387438476086, + "kl": 0.016321102622896433, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 32521421.0, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "rewards/thinking_verbosity_reward/mean": -1.0287491083145142, + "rewards/thinking_verbosity_reward/std": 0.23041406273841858, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2604.0, + "completions/mean_length": 872.484375, + "completions/mean_terminated_length": 855.1653442382812, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "epoch": 0.29024943310657597, + "grad_norm": 0.008569279685616493, + "kl": 0.012772416695952415, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 32777385.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9863669872283936, + "rewards/thinking_verbosity_reward/std": 0.200619176030159, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2362.0, + "completions/mean_length": 891.78515625, + "completions/mean_terminated_length": 874.6181030273438, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.2925170068027211, + "grad_norm": 0.010028647258877754, + "kl": 0.011466184630990028, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 33040218.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -1.026758074760437, + "rewards/thinking_verbosity_reward/std": 0.18459023535251617, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2198.0, + "completions/max_terminated_length": 2198.0, + "completions/mean_length": 886.23046875, + "completions/mean_terminated_length": 886.23046875, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.2947845804988662, + "grad_norm": 0.00939104799181223, + "kl": 0.013060049852356315, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 33298669.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -1.044893503189087, + "rewards/thinking_verbosity_reward/std": 0.19919034838676453, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2905.0, + "completions/mean_length": 891.8515625, + "completions/mean_terminated_length": 830.5621948242188, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.29705215419501135, + "grad_norm": 0.009373553097248077, + "kl": 0.013821915141306818, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 33560855.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.0614581108093262, + "rewards/thinking_verbosity_reward/std": 0.2787906527519226, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2398.0, + "completions/mean_length": 898.453125, + "completions/mean_terminated_length": 837.349365234375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.29931972789115646, + "grad_norm": 0.00858219899237156, + "kl": 0.015290155191905797, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 33826363.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.0381971597671509, + "rewards/thinking_verbosity_reward/std": 0.2640462815761566, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2867.0, + "completions/mean_length": 928.6484375, + "completions/mean_terminated_length": 911.7716674804688, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "epoch": 0.30158730158730157, + "grad_norm": 0.008055904880166054, + "kl": 0.013601035811007023, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 34098841.0, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.125, + "rewards/thinking_verbosity_reward/mean": -1.1090810298919678, + "rewards/thinking_verbosity_reward/std": 0.20494304597377777, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2413.0, + "completions/max_terminated_length": 2413.0, + "completions/mean_length": 848.66015625, + "completions/mean_terminated_length": 848.66015625, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.30385487528344673, + "grad_norm": 0.010503564961254597, + "kl": 0.014706011395901442, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 34346578.0, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.4604927599430084, + "rewards/thinking_verbosity_reward/mean": -0.9905521273612976, + "rewards/thinking_verbosity_reward/std": 0.15640385448932648, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2729.0, + "completions/mean_length": 919.6484375, + "completions/mean_terminated_length": 876.77294921875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.30612244897959184, + "grad_norm": 0.00861930102109909, + "kl": 0.014272409258410335, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 34618920.0, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "rewards/thinking_verbosity_reward/mean": -1.0714051723480225, + "rewards/thinking_verbosity_reward/std": 0.16850516200065613, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 844.33203125, + "completions/mean_terminated_length": 826.7913208007812, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.30839002267573695, + "grad_norm": 0.008574748411774635, + "kl": 0.015117767732590437, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 34868045.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.0222407579421997, + "rewards/thinking_verbosity_reward/std": 0.1451067179441452, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2638.0, + "completions/mean_length": 876.90625, + "completions/mean_terminated_length": 824.2240600585938, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.31065759637188206, + "grad_norm": 0.008331910707056522, + "kl": 0.013488045427948236, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 35126565.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.00288724899292, + "rewards/thinking_verbosity_reward/std": 0.21602074801921844, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2671.0, + "completions/mean_length": 862.43359375, + "completions/mean_terminated_length": 791.1572265625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.3129251700680272, + "grad_norm": 0.011784237809479237, + "kl": 0.017493271152488887, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 35380924.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -1.0489051342010498, + "rewards/thinking_verbosity_reward/std": 0.24461053311824799, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2794.0, + "completions/mean_length": 878.68359375, + "completions/mean_terminated_length": 843.8690795898438, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.31519274376417233, + "grad_norm": 0.009175069630146027, + "kl": 0.01454641402233392, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 35639451.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -0.9692325592041016, + "rewards/thinking_verbosity_reward/std": 0.15597370266914368, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3070.0, + "completions/mean_length": 898.75390625, + "completions/mean_terminated_length": 890.2314453125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.31746031746031744, + "grad_norm": 0.009722675196826458, + "kl": 0.016416668659076095, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 35900876.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -1.0701587200164795, + "rewards/thinking_verbosity_reward/std": 0.1874919831752777, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2815.0, + "completions/mean_length": 936.87890625, + "completions/mean_terminated_length": 876.8554077148438, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.3197278911564626, + "grad_norm": 0.010307110846042633, + "kl": 0.013551881187595427, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 36172317.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -0.9118460416793823, + "rewards/thinking_verbosity_reward/std": 0.1442745476961136, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3005.0, + "completions/mean_length": 915.80078125, + "completions/mean_terminated_length": 864.0520629882812, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.3219954648526077, + "grad_norm": 0.010339327156543732, + "kl": 0.0184286676812917, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 36442226.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.0368635654449463, + "rewards/thinking_verbosity_reward/std": 0.26665186882019043, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2158.0, + "completions/mean_length": 905.1328125, + "completions/mean_terminated_length": 888.0708618164062, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.3242630385487528, + "grad_norm": 0.00704536447301507, + "kl": 0.01663792517501861, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 36707916.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.039925217628479, + "rewards/thinking_verbosity_reward/std": 0.19076105952262878, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3045.0, + "completions/mean_length": 941.70703125, + "completions/mean_terminated_length": 881.8192138671875, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "epoch": 0.32653061224489793, + "grad_norm": 0.00856408104300499, + "kl": 0.013732295366935432, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 36982385.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.0040342807769775, + "rewards/thinking_verbosity_reward/std": 0.12567627429962158, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2531.0, + "completions/mean_length": 814.390625, + "completions/mean_terminated_length": 796.6141967773438, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "epoch": 0.3287981859410431, + "grad_norm": 0.010986111126840115, + "kl": 0.017980167642235756, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 37223405.0, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "rewards/thinking_verbosity_reward/mean": -1.023991584777832, + "rewards/thinking_verbosity_reward/std": 0.1845090687274933, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2361.0, + "completions/mean_length": 900.828125, + "completions/mean_terminated_length": 875.0830688476562, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "epoch": 0.3310657596371882, + "grad_norm": 0.008425871841609478, + "kl": 0.015716631896793842, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 37488625.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.0692064762115479, + "rewards/thinking_verbosity_reward/std": 0.25518599152565, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2718.0, + "completions/max_terminated_length": 2718.0, + "completions/mean_length": 887.46484375, + "completions/mean_terminated_length": 887.46484375, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "epoch": 0.3333333333333333, + "grad_norm": 0.007951130159199238, + "kl": 0.012733308016322553, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 37753736.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -1.0785900354385376, + "rewards/thinking_verbosity_reward/std": 0.154441237449646, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3019.0, + "completions/mean_length": 924.8984375, + "completions/mean_terminated_length": 882.1275024414062, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "epoch": 0.3356009070294785, + "grad_norm": 0.009753545746207237, + "kl": 0.014088809140957892, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 38021094.0, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.5, + "rewards/thinking_verbosity_reward/mean": -1.0492494106292725, + "rewards/thinking_verbosity_reward/std": 0.3139822483062744, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2724.0, + "completions/mean_length": 929.89453125, + "completions/mean_terminated_length": 842.8170166015625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.3378684807256236, + "grad_norm": 0.009636150673031807, + "kl": 0.015885672066360712, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 38287571.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9434583187103271, + "rewards/thinking_verbosity_reward/std": 0.14801499247550964, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1968.0, + "completions/mean_length": 911.57421875, + "completions/mean_terminated_length": 894.56298828125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.3401360544217687, + "grad_norm": 0.009290636517107487, + "kl": 0.013303011655807495, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 38553438.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -1.002690315246582, + "rewards/thinking_verbosity_reward/std": 0.17424112558364868, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2520.0, + "completions/mean_length": 748.03515625, + "completions/mean_terminated_length": 738.921630859375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.3424036281179138, + "grad_norm": 0.01048612967133522, + "kl": 0.01896755420602858, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 38773791.0, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "rewards/thinking_verbosity_reward/mean": -0.9905790090560913, + "rewards/thinking_verbosity_reward/std": 0.22835561633110046, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3061.0, + "completions/mean_length": 856.52734375, + "completions/mean_terminated_length": 821.3611450195312, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.34467120181405897, + "grad_norm": 0.01100789662450552, + "kl": 0.016663537826389074, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 39024918.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -1.0369337797164917, + "rewards/thinking_verbosity_reward/std": 0.23979730904102325, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2784.0, + "completions/mean_length": 879.79296875, + "completions/mean_terminated_length": 853.7984619140625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "epoch": 0.3469387755102041, + "grad_norm": 0.009897883981466293, + "kl": 0.016834822949022055, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 39284473.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -1.0220832824707031, + "rewards/thinking_verbosity_reward/std": 0.17249906063079834, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2868.0, + "completions/mean_length": 945.90234375, + "completions/mean_terminated_length": 937.5647583007812, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.3492063492063492, + "grad_norm": 0.007220855914056301, + "kl": 0.016850179876200855, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 39557576.0, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.4604927599430084, + "rewards/thinking_verbosity_reward/mean": -1.0343188047409058, + "rewards/thinking_verbosity_reward/std": 0.14374901354312897, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2366.0, + "completions/mean_length": 899.1875, + "completions/mean_terminated_length": 838.1043701171875, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.35147392290249435, + "grad_norm": 0.010172298178076744, + "kl": 0.014446899527683854, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 39820056.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -0.9325083494186401, + "rewards/thinking_verbosity_reward/std": 0.16146185994148254, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2964.0, + "completions/max_terminated_length": 2964.0, + "completions/mean_length": 853.48828125, + "completions/mean_terminated_length": 853.48828125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.35374149659863946, + "grad_norm": 0.006909907329827547, + "kl": 0.013716616434976459, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 40073741.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0287976264953613, + "rewards/thinking_verbosity_reward/std": 0.14341773092746735, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2723.0, + "completions/mean_length": 928.93359375, + "completions/mean_terminated_length": 894.916748046875, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "epoch": 0.35600907029478457, + "grad_norm": 0.009901504032313824, + "kl": 0.015794382779859006, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 40345916.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.131295919418335, + "rewards/thinking_verbosity_reward/std": 0.2448146641254425, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2707.0, + "completions/max_terminated_length": 2707.0, + "completions/mean_length": 863.68359375, + "completions/mean_terminated_length": 863.68359375, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.35827664399092973, + "grad_norm": 0.00860818475484848, + "kl": 0.012366964132525027, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 40597659.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.0377671718597412, + "rewards/thinking_verbosity_reward/std": 0.1905304342508316, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2896.0, + "completions/mean_length": 847.1015625, + "completions/mean_terminated_length": 838.3765258789062, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.36054421768707484, + "grad_norm": 0.008556105196475983, + "kl": 0.01563124149106443, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 40844533.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -1.0081844329833984, + "rewards/thinking_verbosity_reward/std": 0.16405585408210754, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2620.0, + "completions/mean_length": 896.29296875, + "completions/mean_terminated_length": 861.7579956054688, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.36281179138321995, + "grad_norm": 0.007447344250977039, + "kl": 0.015082890167832375, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 41105776.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.0712705850601196, + "rewards/thinking_verbosity_reward/std": 0.2392042875289917, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2742.0, + "completions/mean_length": 952.05078125, + "completions/mean_terminated_length": 892.4537963867188, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.36507936507936506, + "grad_norm": 0.009558124467730522, + "kl": 0.013549065333791077, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 41381725.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -0.9774283170700073, + "rewards/thinking_verbosity_reward/std": 0.25084200501441956, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2933.0, + "completions/mean_length": 831.29296875, + "completions/mean_terminated_length": 813.6495971679688, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.3673469387755102, + "grad_norm": 0.008420453406870365, + "kl": 0.01494673069100827, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 41631256.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.0158720016479492, + "rewards/thinking_verbosity_reward/std": 0.22624461352825165, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 835.390625, + "completions/mean_terminated_length": 817.779541015625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.36961451247165533, + "grad_norm": 0.010945898480713367, + "kl": 0.017732376232743263, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 41878116.0, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "rewards/thinking_verbosity_reward/mean": -0.9873806238174438, + "rewards/thinking_verbosity_reward/std": 0.1300235539674759, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1805.0, + "completions/mean_length": 784.0546875, + "completions/mean_terminated_length": 766.0393676757812, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.37188208616780044, + "grad_norm": 0.01134707871824503, + "kl": 0.018498236313462257, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 42111434.0, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "rewards/thinking_verbosity_reward/mean": -0.9369162917137146, + "rewards/thinking_verbosity_reward/std": 0.1752557009458542, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2317.0, + "completions/max_terminated_length": 2317.0, + "completions/mean_length": 791.65234375, + "completions/mean_terminated_length": 791.65234375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.3741496598639456, + "grad_norm": 0.009712261147797108, + "kl": 0.018894724315032363, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 42345161.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.0666654109954834, + "rewards/thinking_verbosity_reward/std": 0.18267840147018433, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2527.0, + "completions/max_terminated_length": 2527.0, + "completions/mean_length": 861.58984375, + "completions/mean_terminated_length": 861.58984375, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.3764172335600907, + "grad_norm": 0.010240369476377964, + "kl": 0.01745079201646149, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 42598240.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.036360263824463, + "rewards/thinking_verbosity_reward/std": 0.23908858001232147, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2917.0, + "completions/mean_length": 841.87890625, + "completions/mean_terminated_length": 815.434814453125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.3786848072562358, + "grad_norm": 0.011244615539908409, + "kl": 0.016731347423046827, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 42848049.0, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "rewards/thinking_verbosity_reward/mean": -0.9775382280349731, + "rewards/thinking_verbosity_reward/std": 0.22414962947368622, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2694.0, + "completions/mean_length": 886.328125, + "completions/mean_terminated_length": 860.4110717773438, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.38095238095238093, + "grad_norm": 0.007972556166350842, + "kl": 0.018314856686629355, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 43106365.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.0373663902282715, + "rewards/thinking_verbosity_reward/std": 0.16679736971855164, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2776.0, + "completions/mean_length": 877.64453125, + "completions/mean_terminated_length": 851.62451171875, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "epoch": 0.3832199546485261, + "grad_norm": 0.009399347007274628, + "kl": 0.015880511258728802, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 43362370.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -0.998458981513977, + "rewards/thinking_verbosity_reward/std": 0.16497254371643066, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2524.0, + "completions/mean_length": 726.5625, + "completions/mean_terminated_length": 698.7510375976562, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.3854875283446712, + "grad_norm": 0.014450881630182266, + "kl": 0.018910197424702346, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 43582042.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -0.9687459468841553, + "rewards/thinking_verbosity_reward/std": 0.12299178540706635, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 845.85546875, + "completions/mean_terminated_length": 837.1255493164062, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.3877551020408163, + "grad_norm": 0.011153717525303364, + "kl": 0.01964964799117297, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 43830493.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -1.0590170621871948, + "rewards/thinking_verbosity_reward/std": 0.13932372629642487, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2306.0, + "completions/mean_length": 866.921875, + "completions/mean_terminated_length": 858.2745361328125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.3900226757369615, + "grad_norm": 0.008990363217890263, + "kl": 0.01546497130766511, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 44084657.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -1.0729436874389648, + "rewards/thinking_verbosity_reward/std": 0.18463462591171265, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2965.0, + "completions/mean_length": 934.0546875, + "completions/mean_terminated_length": 873.9517822265625, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "epoch": 0.3922902494331066, + "grad_norm": 0.010577639564871788, + "kl": 0.019168772967532277, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 44355223.0, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49501484632492065, + "rewards/thinking_verbosity_reward/mean": -1.0507559776306152, + "rewards/thinking_verbosity_reward/std": 0.24755479395389557, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2625.0, + "completions/mean_length": 874.8359375, + "completions/mean_terminated_length": 857.535400390625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.3945578231292517, + "grad_norm": 0.011470607481896877, + "kl": 0.017078522709198296, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 44611325.0, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4787135720252991, + "rewards/thinking_verbosity_reward/mean": -1.0353235006332397, + "rewards/thinking_verbosity_reward/std": 0.22302886843681335, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2334.0, + "completions/mean_length": 883.64453125, + "completions/mean_terminated_length": 840.0518188476562, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.3968253968253968, + "grad_norm": 0.009749037213623524, + "kl": 0.015647199819795787, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 44870978.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -0.9983662366867065, + "rewards/thinking_verbosity_reward/std": 0.19360637664794922, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2479.0, + "completions/mean_length": 811.56640625, + "completions/mean_terminated_length": 775.6865234375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.39909297052154197, + "grad_norm": 0.011813972145318985, + "kl": 0.020430682692676783, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 45114595.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -1.0237352848052979, + "rewards/thinking_verbosity_reward/std": 0.20367014408111572, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2354.0, + "completions/mean_length": 903.21875, + "completions/mean_terminated_length": 860.0159912109375, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.4013605442176871, + "grad_norm": 0.009808878414332867, + "kl": 0.015558319166302681, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 45378291.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.068803310394287, + "rewards/thinking_verbosity_reward/std": 0.2771081328392029, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2965.0, + "completions/mean_length": 915.2421875, + "completions/mean_terminated_length": 881.0079956054688, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.4036281179138322, + "grad_norm": 0.00999904703348875, + "kl": 0.01731885108165443, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 45644921.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -0.9761955738067627, + "rewards/thinking_verbosity_reward/std": 0.21187786757946014, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2872.0, + "completions/mean_length": 883.65625, + "completions/mean_terminated_length": 822.1365356445312, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.40589569160997735, + "grad_norm": 0.01157259289175272, + "kl": 0.015164659125730395, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 45901681.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -0.9666751623153687, + "rewards/thinking_verbosity_reward/std": 0.19454891979694366, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2309.0, + "completions/mean_length": 799.7421875, + "completions/mean_terminated_length": 781.8504028320312, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.40816326530612246, + "grad_norm": 0.00961134023964405, + "kl": 0.016474129050038755, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 46142207.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -0.9901701211929321, + "rewards/thinking_verbosity_reward/std": 0.18428485095500946, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3001.0, + "completions/mean_length": 929.9609375, + "completions/mean_terminated_length": 904.561279296875, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.41043083900226757, + "grad_norm": 0.00899998378008604, + "kl": 0.013840968487784266, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 46415893.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.0798110961914062, + "rewards/thinking_verbosity_reward/std": 0.2774321734905243, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2921.0, + "completions/mean_length": 773.0078125, + "completions/mean_terminated_length": 754.905517578125, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.4126984126984127, + "grad_norm": 0.009478210471570492, + "kl": 0.018688766518607736, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 46648447.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9747356176376343, + "rewards/thinking_verbosity_reward/std": 0.2368944138288498, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2916.0, + "completions/mean_length": 812.36328125, + "completions/mean_terminated_length": 803.5020141601562, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.41496598639455784, + "grad_norm": 0.012250646948814392, + "kl": 0.01789340330287814, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 46891068.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -0.9874039888381958, + "rewards/thinking_verbosity_reward/std": 0.167362242937088, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1908.0, + "completions/mean_length": 893.7109375, + "completions/mean_terminated_length": 850.3187255859375, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.41723356009070295, + "grad_norm": 0.008382049389183521, + "kl": 0.019076674710959196, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 47153090.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -0.9647634029388428, + "rewards/thinking_verbosity_reward/std": 0.1518833339214325, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2559.0, + "completions/mean_length": 952.5703125, + "completions/mean_terminated_length": 892.9879150390625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.41950113378684806, + "grad_norm": 0.00960648711770773, + "kl": 0.013788689277134836, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 47431884.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -1.0704338550567627, + "rewards/thinking_verbosity_reward/std": 0.24710920453071594, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2469.0, + "completions/mean_length": 862.22265625, + "completions/mean_terminated_length": 853.556884765625, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "epoch": 0.4217687074829932, + "grad_norm": 0.011173815466463566, + "kl": 0.013377169729210436, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 47683333.0, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "rewards/thinking_verbosity_reward/mean": -0.9211025834083557, + "rewards/thinking_verbosity_reward/std": 0.15264762938022614, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2473.0, + "completions/mean_length": 921.02734375, + "completions/mean_terminated_length": 895.5217895507812, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.42403628117913833, + "grad_norm": 0.009727582335472107, + "kl": 0.01520753011573106, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 47954188.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.042634129524231, + "rewards/thinking_verbosity_reward/std": 0.20860502123832703, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3051.0, + "completions/mean_length": 956.21875, + "completions/mean_terminated_length": 905.4400634765625, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "epoch": 0.42630385487528344, + "grad_norm": 0.010211072862148285, + "kl": 0.014484946383163333, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 48233340.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -1.1893620491027832, + "rewards/thinking_verbosity_reward/std": 0.27693769335746765, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2824.0, + "completions/mean_length": 833.875, + "completions/mean_terminated_length": 798.3492431640625, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.42857142857142855, + "grad_norm": 0.010349276475608349, + "kl": 0.015537602012045681, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 48476212.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9466103315353394, + "rewards/thinking_verbosity_reward/std": 0.18640074133872986, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2057.0, + "completions/max_terminated_length": 2057.0, + "completions/mean_length": 836.625, + "completions/mean_terminated_length": 836.625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.4308390022675737, + "grad_norm": 0.009273609146475792, + "kl": 0.018451889511197805, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 48721524.0, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.125, + "rewards/thinking_verbosity_reward/mean": -0.9888083338737488, + "rewards/thinking_verbosity_reward/std": 0.20332609117031097, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3047.0, + "completions/mean_length": 1028.453125, + "completions/mean_terminated_length": 953.991943359375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.4331065759637188, + "grad_norm": 0.010177548974752426, + "kl": 0.012185210129246116, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 49019192.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.050152063369751, + "rewards/thinking_verbosity_reward/std": 0.11916085332632065, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2445.0, + "completions/mean_length": 854.671875, + "completions/mean_terminated_length": 819.4762573242188, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.43537414965986393, + "grad_norm": 0.010393013246357441, + "kl": 0.02104986272752285, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 49272036.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.1220296621322632, + "rewards/thinking_verbosity_reward/std": 0.17715369164943695, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2914.0, + "completions/mean_length": 880.0703125, + "completions/mean_terminated_length": 854.0791015625, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.4376417233560091, + "grad_norm": 0.00841362401843071, + "kl": 0.016141690546646714, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 49524990.0, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4787135720252991, + "rewards/thinking_verbosity_reward/mean": -1.1212995052337646, + "rewards/thinking_verbosity_reward/std": 0.20928777754306793, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2575.0, + "completions/max_terminated_length": 2575.0, + "completions/mean_length": 908.55859375, + "completions/mean_terminated_length": 908.55859375, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 0.4399092970521542, + "grad_norm": 0.009237721562385559, + "kl": 0.01505567308049649, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 49792757.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.014237880706787, + "rewards/thinking_verbosity_reward/std": 0.13566894829273224, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2964.0, + "completions/mean_length": 845.34765625, + "completions/mean_terminated_length": 818.9447021484375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.4421768707482993, + "grad_norm": 0.013051062822341919, + "kl": 0.018628071062266827, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 50038902.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -1.0244910717010498, + "rewards/thinking_verbosity_reward/std": 0.25901997089385986, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2928.0, + "completions/mean_length": 810.23828125, + "completions/mean_terminated_length": 783.4190063476562, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.4444444444444444, + "grad_norm": 0.010826957412064075, + "kl": 0.018433311954140663, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 50275491.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -0.9748213887214661, + "rewards/thinking_verbosity_reward/std": 0.2647187411785126, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2499.0, + "completions/mean_length": 822.14453125, + "completions/mean_terminated_length": 786.4325561523438, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.4467120181405896, + "grad_norm": 0.010616572573781013, + "kl": 0.01714473543688655, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 50517408.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -0.9253194332122803, + "rewards/thinking_verbosity_reward/std": 0.13301581144332886, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2432.0, + "completions/mean_length": 953.55859375, + "completions/mean_terminated_length": 928.4387817382812, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.4489795918367347, + "grad_norm": 0.012065291404724121, + "kl": 0.01519809348974377, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 50792255.0, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4917473793029785, + "rewards/thinking_verbosity_reward/mean": -1.0095070600509644, + "rewards/thinking_verbosity_reward/std": 0.1947650909423828, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2819.0, + "completions/mean_length": 978.3828125, + "completions/mean_terminated_length": 936.6773071289062, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "epoch": 0.4512471655328798, + "grad_norm": 0.010819735936820507, + "kl": 0.014236372313462198, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 51081449.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.0474004745483398, + "rewards/thinking_verbosity_reward/std": 0.174185112118721, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2715.0, + "completions/mean_length": 956.3203125, + "completions/mean_terminated_length": 852.2704467773438, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "epoch": 0.45351473922902497, + "grad_norm": 0.011566883884370327, + "kl": 0.018665405455976725, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 51356619.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -1.205664873123169, + "rewards/thinking_verbosity_reward/std": 0.2620229125022888, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1900.0, + "completions/mean_length": 857.3515625, + "completions/mean_terminated_length": 804.2000122070312, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "epoch": 0.4557823129251701, + "grad_norm": 0.012561334297060966, + "kl": 0.018475383054465055, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 51609125.0, + "rewards/accuracy_reward/mean": 0.4375, + "rewards/accuracy_reward/std": 0.5, + "rewards/thinking_verbosity_reward/mean": -0.9909151196479797, + "rewards/thinking_verbosity_reward/std": 0.16245025396347046, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2807.0, + "completions/mean_length": 897.88671875, + "completions/mean_terminated_length": 854.5776977539062, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.4580498866213152, + "grad_norm": 0.009446638636291027, + "kl": 0.01895881164819002, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 51873864.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -0.9506819248199463, + "rewards/thinking_verbosity_reward/std": 0.19657985866069794, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2754.0, + "completions/mean_length": 888.41015625, + "completions/mean_terminated_length": 862.517822265625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.4603174603174603, + "grad_norm": 0.010176736861467361, + "kl": 0.016433256911113858, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 52135129.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -0.9547950029373169, + "rewards/thinking_verbosity_reward/std": 0.24698041379451752, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2834.0, + "completions/mean_length": 930.0, + "completions/mean_terminated_length": 860.9031982421875, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.46258503401360546, + "grad_norm": 0.010058085434138775, + "kl": 0.017858052742667496, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 52405969.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -0.9141961932182312, + "rewards/thinking_verbosity_reward/std": 0.16567330062389374, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2548.0, + "completions/mean_length": 956.265625, + "completions/mean_terminated_length": 896.787109375, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "epoch": 0.46485260770975056, + "grad_norm": 0.010283946990966797, + "kl": 0.015081918914802372, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 52683253.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.0863326787948608, + "rewards/thinking_verbosity_reward/std": 0.179169163107872, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2805.0, + "completions/mean_length": 953.46875, + "completions/mean_terminated_length": 919.84130859375, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.4671201814058957, + "grad_norm": 0.009607678279280663, + "kl": 0.01725420611910522, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 52955925.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.070212960243225, + "rewards/thinking_verbosity_reward/std": 0.21249541640281677, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2528.0, + "completions/mean_length": 817.73046875, + "completions/mean_terminated_length": 781.948486328125, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 0.46938775510204084, + "grad_norm": 0.012886897660791874, + "kl": 0.021222397801466286, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 53197416.0, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.4604927599430084, + "rewards/thinking_verbosity_reward/mean": -1.0620214939117432, + "rewards/thinking_verbosity_reward/std": 0.22511275112628937, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2668.0, + "completions/max_terminated_length": 2668.0, + "completions/mean_length": 856.96484375, + "completions/mean_terminated_length": 856.96484375, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "epoch": 0.47165532879818595, + "grad_norm": 0.009554131887853146, + "kl": 0.01887553697451949, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 53448855.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.0204942226409912, + "rewards/thinking_verbosity_reward/std": 0.20653434097766876, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2235.0, + "completions/mean_length": 916.04296875, + "completions/mean_terminated_length": 873.0956420898438, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.47392290249433106, + "grad_norm": 0.009789001196622849, + "kl": 0.017955805640667677, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 53713378.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.0864450931549072, + "rewards/thinking_verbosity_reward/std": 0.1681724488735199, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2540.0, + "completions/mean_length": 816.61328125, + "completions/mean_terminated_length": 807.7686767578125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.47619047619047616, + "grad_norm": 0.012541405856609344, + "kl": 0.01750082685612142, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 53954255.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -0.9920581579208374, + "rewards/thinking_verbosity_reward/std": 0.15404701232910156, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2810.0, + "completions/mean_length": 869.859375, + "completions/mean_terminated_length": 834.90478515625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.47845804988662133, + "grad_norm": 0.01181892678141594, + "kl": 0.01731920801103115, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 54206971.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -0.9646005034446716, + "rewards/thinking_verbosity_reward/std": 0.2688433825969696, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2995.0, + "completions/mean_length": 943.91796875, + "completions/mean_terminated_length": 918.683837890625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 0.48072562358276644, + "grad_norm": 0.010188623331487179, + "kl": 0.019708609441295266, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 54480102.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -0.9490549564361572, + "rewards/thinking_verbosity_reward/std": 0.16570480167865753, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2648.0, + "completions/mean_length": 910.01953125, + "completions/mean_terminated_length": 822.1340942382812, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.48299319727891155, + "grad_norm": 0.008586824871599674, + "kl": 0.015290202456526458, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 54745059.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -1.020936131477356, + "rewards/thinking_verbosity_reward/std": 0.25687742233276367, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2638.0, + "completions/mean_length": 974.46484375, + "completions/mean_terminated_length": 915.4979858398438, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.4852607709750567, + "grad_norm": 0.008974796161055565, + "kl": 0.01757437875494361, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 55027594.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -1.023360013961792, + "rewards/thinking_verbosity_reward/std": 0.1848423033952713, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2650.0, + "completions/mean_length": 837.265625, + "completions/mean_terminated_length": 810.766845703125, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.4875283446712018, + "grad_norm": 0.010545222088694572, + "kl": 0.018052082043141127, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 55273798.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -0.9291400909423828, + "rewards/thinking_verbosity_reward/std": 0.19482260942459106, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2373.0, + "completions/mean_length": 785.828125, + "completions/mean_terminated_length": 758.7194213867188, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.4897959183673469, + "grad_norm": 0.011413714848458767, + "kl": 0.021833877777680755, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 55507618.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -1.0762391090393066, + "rewards/thinking_verbosity_reward/std": 0.17352086305618286, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2090.0, + "completions/mean_length": 877.5703125, + "completions/mean_terminated_length": 824.904052734375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.49206349206349204, + "grad_norm": 0.008202183060348034, + "kl": 0.018758163787424564, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 55768380.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.034774899482727, + "rewards/thinking_verbosity_reward/std": 0.22267931699752808, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2888.0, + "completions/mean_length": 876.234375, + "completions/mean_terminated_length": 858.9448852539062, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "epoch": 0.4943310657596372, + "grad_norm": 0.009659706614911556, + "kl": 0.015067456639371812, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 56024600.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -0.9955744743347168, + "rewards/thinking_verbosity_reward/std": 0.17310035228729248, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2603.0, + "completions/mean_length": 803.78515625, + "completions/mean_terminated_length": 794.8902587890625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.4965986394557823, + "grad_norm": 0.013116846792399883, + "kl": 0.01907317037694156, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 56267561.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -1.0039483308792114, + "rewards/thinking_verbosity_reward/std": 0.1465807408094406, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2737.0, + "completions/mean_length": 872.359375, + "completions/mean_terminated_length": 855.0393676757812, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.4988662131519274, + "grad_norm": 0.009577963501214981, + "kl": 0.018186470260843635, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 56524573.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0075783729553223, + "rewards/thinking_verbosity_reward/std": 0.17280547320842743, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3028.0, + "completions/mean_length": 962.6875, + "completions/mean_terminated_length": 894.6451416015625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.5011337868480725, + "grad_norm": 0.010678299702703953, + "kl": 0.018511016853153706, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 56800501.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.1356804370880127, + "rewards/thinking_verbosity_reward/std": 0.28086355328559875, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3026.0, + "completions/mean_length": 910.14453125, + "completions/mean_terminated_length": 893.1220703125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.5034013605442177, + "grad_norm": 0.010262486524879932, + "kl": 0.01926219160668552, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 57067458.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.0604560375213623, + "rewards/thinking_verbosity_reward/std": 0.18050743639469147, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2941.0, + "completions/mean_length": 879.82421875, + "completions/mean_terminated_length": 818.19677734375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.5056689342403629, + "grad_norm": 0.01337781734764576, + "kl": 0.014888972626067698, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 57326221.0, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "rewards/thinking_verbosity_reward/mean": -0.866876482963562, + "rewards/thinking_verbosity_reward/std": 0.2467440664768219, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3014.0, + "completions/mean_length": 793.77734375, + "completions/mean_terminated_length": 775.8385620117188, + "completions/min_length": 288.0, + "completions/min_terminated_length": 288.0, + "epoch": 0.5079365079365079, + "grad_norm": 0.0136335464194417, + "kl": 0.019439294235780835, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 57560460.0, + "rewards/accuracy_reward/mean": 0.484375, + "rewards/accuracy_reward/std": 0.5037065148353577, + "rewards/thinking_verbosity_reward/mean": -0.9557390213012695, + "rewards/thinking_verbosity_reward/std": 0.17615950107574463, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2616.0, + "completions/mean_length": 872.81640625, + "completions/mean_terminated_length": 846.7391357421875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.5102040816326531, + "grad_norm": 0.009327380917966366, + "kl": 0.017893899464979768, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 57813733.0, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4787135720252991, + "rewards/thinking_verbosity_reward/mean": -0.9121633768081665, + "rewards/thinking_verbosity_reward/std": 0.17167750000953674, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2516.0, + "completions/mean_length": 892.61328125, + "completions/mean_terminated_length": 849.19921875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.5124716553287982, + "grad_norm": 0.013548355549573898, + "kl": 0.018114933976903558, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 58073746.0, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4836103618144989, + "rewards/thinking_verbosity_reward/mean": -0.9720928072929382, + "rewards/thinking_verbosity_reward/std": 0.26396772265434265, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3045.0, + "completions/mean_length": 937.49609375, + "completions/mean_terminated_length": 886.26806640625, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.5147392290249433, + "grad_norm": 0.010222046636044979, + "kl": 0.02042914554476738, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 58349545.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -0.9903563261032104, + "rewards/thinking_verbosity_reward/std": 0.21816101670265198, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2886.0, + "completions/mean_length": 931.671875, + "completions/mean_terminated_length": 872.2088012695312, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.5170068027210885, + "grad_norm": 0.010179056786000729, + "kl": 0.015022416017018259, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 58622245.0, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4787135720252991, + "rewards/thinking_verbosity_reward/mean": -1.131662368774414, + "rewards/thinking_verbosity_reward/std": 0.3688305616378784, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2461.0, + "completions/mean_length": 816.3671875, + "completions/mean_terminated_length": 789.62060546875, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.5192743764172335, + "grad_norm": 0.012237810529768467, + "kl": 0.018424668232910335, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 58865347.0, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49501484632492065, + "rewards/thinking_verbosity_reward/mean": -0.9203896522521973, + "rewards/thinking_verbosity_reward/std": 0.17915146052837372, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2128.0, + "completions/mean_length": 888.37109375, + "completions/mean_terminated_length": 853.7103881835938, + "completions/min_length": 331.0, + "completions/min_terminated_length": 331.0, + "epoch": 0.5215419501133787, + "grad_norm": 0.010966089554131031, + "kl": 0.014149946859106421, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 59127386.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.061596155166626, + "rewards/thinking_verbosity_reward/std": 0.20837153494358063, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2887.0, + "completions/mean_length": 849.46875, + "completions/mean_terminated_length": 831.968505859375, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "epoch": 0.5238095238095238, + "grad_norm": 0.01015971228480339, + "kl": 0.019778476329520345, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 59376802.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9858565330505371, + "rewards/thinking_verbosity_reward/std": 0.23418447375297546, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2801.0, + "completions/mean_length": 877.89453125, + "completions/mean_terminated_length": 843.0675048828125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.5260770975056689, + "grad_norm": 0.008144108578562737, + "kl": 0.016903184936381876, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 59634783.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0368165969848633, + "rewards/thinking_verbosity_reward/std": 0.16309121251106262, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2650.0, + "completions/mean_length": 854.859375, + "completions/mean_terminated_length": 846.1647338867188, + "completions/min_length": 277.0, + "completions/min_terminated_length": 277.0, + "epoch": 0.528344671201814, + "grad_norm": 0.009617917239665985, + "kl": 0.019503187853842974, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 59892035.0, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "rewards/thinking_verbosity_reward/mean": -1.018017292022705, + "rewards/thinking_verbosity_reward/std": 0.18858323991298676, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2512.0, + "completions/mean_length": 805.796875, + "completions/mean_terminated_length": 796.9098510742188, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.5306122448979592, + "grad_norm": 0.012487702071666718, + "kl": 0.02097301627509296, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 60131567.0, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4836103618144989, + "rewards/thinking_verbosity_reward/mean": -0.9372197389602661, + "rewards/thinking_verbosity_reward/std": 0.17001986503601074, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2700.0, + "completions/mean_length": 923.23828125, + "completions/mean_terminated_length": 862.831298828125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 0.5328798185941043, + "grad_norm": 0.010491319932043552, + "kl": 0.01767199533060193, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 60399380.0, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "rewards/thinking_verbosity_reward/mean": -0.8952465653419495, + "rewards/thinking_verbosity_reward/std": 0.16274677217006683, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2854.0, + "completions/mean_length": 997.83984375, + "completions/mean_terminated_length": 939.5300903320312, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.5351473922902494, + "grad_norm": 0.009466889314353466, + "kl": 0.015858184546232224, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 60687411.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.0495420694351196, + "rewards/thinking_verbosity_reward/std": 0.2152528613805771, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2085.0, + "completions/mean_length": 813.921875, + "completions/mean_terminated_length": 787.1463012695312, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.5374149659863946, + "grad_norm": 0.011397882364690304, + "kl": 0.020148760173469782, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 60931175.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -0.9736645221710205, + "rewards/thinking_verbosity_reward/std": 0.2628914713859558, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2898.0, + "completions/max_terminated_length": 2898.0, + "completions/mean_length": 877.1796875, + "completions/mean_terminated_length": 877.1796875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.5396825396825397, + "grad_norm": 0.009069756604731083, + "kl": 0.018483336665667593, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61189437.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.032941222190857, + "rewards/thinking_verbosity_reward/std": 0.1608053743839264, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2746.0, + "completions/mean_length": 901.703125, + "completions/mean_terminated_length": 893.1921997070312, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.5419501133786848, + "grad_norm": 0.00949011743068695, + "kl": 0.01853169733658433, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61453705.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.0403672456741333, + "rewards/thinking_verbosity_reward/std": 0.17530949413776398, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2671.0, + "completions/mean_length": 838.28125, + "completions/mean_terminated_length": 811.7944946289062, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.54421768707483, + "grad_norm": 0.010035160928964615, + "kl": 0.018584770848974586, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61701153.0, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.4604927599430084, + "rewards/thinking_verbosity_reward/mean": -0.983948826789856, + "rewards/thinking_verbosity_reward/std": 0.18784740567207336, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3008.0, + "completions/mean_length": 867.140625, + "completions/mean_terminated_length": 805.1566162109375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.546485260770975, + "grad_norm": 0.012387460097670555, + "kl": 0.01664351497311145, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61960277.0, + "rewards/accuracy_reward/mean": 0.46875, + "rewards/accuracy_reward/std": 0.5029674172401428, + "rewards/thinking_verbosity_reward/mean": -0.9387006759643555, + "rewards/thinking_verbosity_reward/std": 0.2057056427001953, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2984.0, + "completions/mean_length": 853.9296875, + "completions/mean_terminated_length": 827.6284790039062, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.5487528344671202, + "grad_norm": 0.011626170016825199, + "kl": 0.019829437136650085, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 62211595.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.1149451732635498, + "rewards/thinking_verbosity_reward/std": 0.20841148495674133, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2719.0, + "completions/mean_length": 890.20703125, + "completions/mean_terminated_length": 864.3359985351562, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.5510204081632653, + "grad_norm": 0.008997851982712746, + "kl": 0.02027058065868914, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 62470944.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -0.9811993837356567, + "rewards/thinking_verbosity_reward/std": 0.1601766049861908, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2175.0, + "completions/mean_length": 934.0, + "completions/mean_terminated_length": 873.8955688476562, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.5532879818594104, + "grad_norm": 0.009826377965509892, + "kl": 0.01771622523665428, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 62744904.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.003645420074463, + "rewards/thinking_verbosity_reward/std": 0.19293221831321716, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3024.0, + "completions/mean_length": 841.99609375, + "completions/mean_terminated_length": 797.57373046875, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "epoch": 0.5555555555555556, + "grad_norm": 0.011503640562295914, + "kl": 0.020586024271324277, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 62990383.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -0.9044971466064453, + "rewards/thinking_verbosity_reward/std": 0.19540897011756897, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2343.0, + "completions/mean_length": 910.03515625, + "completions/mean_terminated_length": 866.9681396484375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.5578231292517006, + "grad_norm": 0.009635476395487785, + "kl": 0.013897400349378586, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 63259936.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.045746088027954, + "rewards/thinking_verbosity_reward/std": 0.22148792445659637, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2707.0, + "completions/mean_length": 956.8046875, + "completions/mean_terminated_length": 923.230224609375, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.5600907029478458, + "grad_norm": 0.009647101163864136, + "kl": 0.015565865091048181, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 63541470.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -1.081800937652588, + "rewards/thinking_verbosity_reward/std": 0.2681126892566681, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2536.0, + "completions/mean_length": 882.69140625, + "completions/mean_terminated_length": 856.7312622070312, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.562358276643991, + "grad_norm": 0.010811948217451572, + "kl": 0.018627667566761374, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 63798991.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.014650821685791, + "rewards/thinking_verbosity_reward/std": 0.1856333613395691, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2972.0, + "completions/mean_length": 825.62890625, + "completions/mean_terminated_length": 816.8196411132812, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.564625850340136, + "grad_norm": 0.012359336949884892, + "kl": 0.022031506756320596, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 64045280.0, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4787135720252991, + "rewards/thinking_verbosity_reward/mean": -1.0271450281143188, + "rewards/thinking_verbosity_reward/std": 0.15902872383594513, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 832.27734375, + "completions/mean_terminated_length": 805.7194213867188, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.5668934240362812, + "grad_norm": 0.013162931427359581, + "kl": 0.019426542334258556, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 64294151.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -1.0144970417022705, + "rewards/thinking_verbosity_reward/std": 0.16285768151283264, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2725.0, + "completions/mean_length": 886.75390625, + "completions/mean_terminated_length": 843.22314453125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.5691609977324263, + "grad_norm": 0.010557233355939388, + "kl": 0.01817580289207399, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 64555504.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -1.0614598989486694, + "rewards/thinking_verbosity_reward/std": 0.27215760946273804, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2715.0, + "completions/mean_length": 923.90234375, + "completions/mean_terminated_length": 889.8056030273438, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "epoch": 0.5714285714285714, + "grad_norm": 0.008848571218550205, + "kl": 0.02090651378966868, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 64825359.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -0.992445170879364, + "rewards/thinking_verbosity_reward/std": 0.16101714968681335, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3045.0, + "completions/mean_length": 912.2265625, + "completions/mean_terminated_length": 877.9445190429688, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.5736961451247166, + "grad_norm": 0.009608199819922447, + "kl": 0.01735127787105739, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 65091985.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -0.9541873931884766, + "rewards/thinking_verbosity_reward/std": 0.2028442919254303, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2356.0, + "completions/mean_length": 867.84375, + "completions/mean_terminated_length": 850.4881591796875, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "epoch": 0.5759637188208617, + "grad_norm": 0.009549538604915142, + "kl": 0.020305058686062694, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 65347273.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.0338647365570068, + "rewards/thinking_verbosity_reward/std": 0.21350876986980438, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1955.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 846.93359375, + "completions/mean_terminated_length": 846.93359375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.5782312925170068, + "grad_norm": 0.011227017268538475, + "kl": 0.018905576318502426, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 65595976.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -1.0319279432296753, + "rewards/thinking_verbosity_reward/std": 0.1482962965965271, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2638.0, + "completions/mean_length": 988.875, + "completions/mean_terminated_length": 938.8800659179688, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "epoch": 0.5804988662131519, + "grad_norm": 0.011240279302001, + "kl": 0.018975541926920414, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 65880536.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.1208958625793457, + "rewards/thinking_verbosity_reward/std": 0.1762673258781433, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2800.0, + "completions/mean_length": 879.11328125, + "completions/mean_terminated_length": 853.1107177734375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.5827664399092971, + "grad_norm": 0.01031454000622034, + "kl": 0.01868701702915132, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 66136373.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -0.9758046865463257, + "rewards/thinking_verbosity_reward/std": 0.1795954704284668, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2199.0, + "completions/max_terminated_length": 2199.0, + "completions/mean_length": 858.7421875, + "completions/mean_terminated_length": 858.7421875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.5850340136054422, + "grad_norm": 0.01018265075981617, + "kl": 0.01880666718352586, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 66388907.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -0.9635556936264038, + "rewards/thinking_verbosity_reward/std": 0.12692080438137054, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2328.0, + "completions/mean_length": 855.58984375, + "completions/mean_terminated_length": 829.308349609375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.5873015873015873, + "grad_norm": 0.008613046258687973, + "kl": 0.020097772823646665, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 66642202.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -0.976679801940918, + "rewards/thinking_verbosity_reward/std": 0.14707103371620178, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2330.0, + "completions/mean_length": 885.58203125, + "completions/mean_terminated_length": 877.0078735351562, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.5895691609977324, + "grad_norm": 0.011235913261771202, + "kl": 0.018165227491408587, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 66901407.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -1.0549300909042358, + "rewards/thinking_verbosity_reward/std": 0.16321846842765808, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2923.0, + "completions/mean_length": 984.546875, + "completions/mean_terminated_length": 934.4480590820312, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.5918367346938775, + "grad_norm": 0.0077292947098612785, + "kl": 0.01922546187415719, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 67184771.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.0173218250274658, + "rewards/thinking_verbosity_reward/std": 0.16931739449501038, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2660.0, + "completions/mean_length": 827.578125, + "completions/mean_terminated_length": 764.4818725585938, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.5941043083900227, + "grad_norm": 0.012787089683115482, + "kl": 0.020877547096461058, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 67428207.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.0398929119110107, + "rewards/thinking_verbosity_reward/std": 0.32803812623023987, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2657.0, + "completions/mean_length": 903.5390625, + "completions/mean_terminated_length": 895.0353393554688, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.5963718820861678, + "grad_norm": 0.009455646388232708, + "kl": 0.018661829875782132, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 67691969.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.0347882509231567, + "rewards/thinking_verbosity_reward/std": 0.20681467652320862, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2618.0, + "completions/mean_length": 839.828125, + "completions/mean_terminated_length": 822.251953125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 0.5986394557823129, + "grad_norm": 0.012901975773274899, + "kl": 0.019611574476584792, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 67938805.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -0.9555234909057617, + "rewards/thinking_verbosity_reward/std": 0.15347255766391754, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2634.0, + "completions/mean_length": 909.25, + "completions/mean_terminated_length": 883.6047973632812, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "epoch": 0.6009070294784581, + "grad_norm": 0.011896652169525623, + "kl": 0.020362972281873226, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 68203293.0, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "rewards/thinking_verbosity_reward/mean": -0.9837641716003418, + "rewards/thinking_verbosity_reward/std": 0.2290009707212448, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2875.0, + "completions/mean_length": 866.2578125, + "completions/mean_terminated_length": 804.2489624023438, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 0.6031746031746031, + "grad_norm": 0.01137470081448555, + "kl": 0.020645360462367535, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 68456855.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -1.0125796794891357, + "rewards/thinking_verbosity_reward/std": 0.26441490650177, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2531.0, + "completions/mean_length": 898.76953125, + "completions/mean_terminated_length": 873.015869140625, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.6054421768707483, + "grad_norm": 0.011303579434752464, + "kl": 0.020650713704526424, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 68722228.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -0.9931071400642395, + "rewards/thinking_verbosity_reward/std": 0.16118338704109192, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2903.0, + "completions/mean_length": 882.05078125, + "completions/mean_terminated_length": 873.4627685546875, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.6077097505668935, + "grad_norm": 0.010899068787693977, + "kl": 0.02008616807870567, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 68980929.0, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "rewards/thinking_verbosity_reward/mean": -0.939687967300415, + "rewards/thinking_verbosity_reward/std": 0.1735316962003708, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2434.0, + "completions/max_terminated_length": 2434.0, + "completions/mean_length": 780.4296875, + "completions/mean_terminated_length": 780.4296875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.6099773242630385, + "grad_norm": 0.01137396041303873, + "kl": 0.018313308828510344, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 69212119.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -0.9517003297805786, + "rewards/thinking_verbosity_reward/std": 0.237481951713562, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2581.0, + "completions/mean_length": 810.35546875, + "completions/mean_terminated_length": 774.4564208984375, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "epoch": 0.6122448979591837, + "grad_norm": 0.01019532885402441, + "kl": 0.02349461684934795, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 69451698.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -0.9625643491744995, + "rewards/thinking_verbosity_reward/std": 0.12891879677772522, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2871.0, + "completions/mean_length": 909.76171875, + "completions/mean_terminated_length": 866.6892700195312, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.6145124716553289, + "grad_norm": 0.011482106521725655, + "kl": 0.018015810986980796, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 69719429.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -1.1086618900299072, + "rewards/thinking_verbosity_reward/std": 0.21308767795562744, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3065.0, + "completions/mean_length": 820.8984375, + "completions/mean_terminated_length": 794.20556640625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.6167800453514739, + "grad_norm": 0.013037718832492828, + "kl": 0.02113636559806764, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 69962755.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -0.9496555328369141, + "rewards/thinking_verbosity_reward/std": 0.1870565116405487, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2881.0, + "completions/mean_length": 893.921875, + "completions/mean_terminated_length": 841.6480102539062, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.6190476190476191, + "grad_norm": 0.01225859671831131, + "kl": 0.022405725438147783, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 70223839.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -0.9976888298988342, + "rewards/thinking_verbosity_reward/std": 0.17471018433570862, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2461.0, + "completions/mean_length": 845.375, + "completions/mean_terminated_length": 827.842529296875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.6213151927437641, + "grad_norm": 0.012705322355031967, + "kl": 0.024155253544449806, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 70467359.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -0.9544304609298706, + "rewards/thinking_verbosity_reward/std": 0.2659962475299835, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2928.0, + "completions/mean_length": 912.984375, + "completions/mean_terminated_length": 861.1680297851562, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.6235827664399093, + "grad_norm": 0.011366760358214378, + "kl": 0.02076743496581912, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 70732267.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.0417296886444092, + "rewards/thinking_verbosity_reward/std": 0.2999908924102783, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2439.0, + "completions/mean_length": 828.59375, + "completions/mean_terminated_length": 802.7352294921875, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "epoch": 0.6258503401360545, + "grad_norm": 0.010333663783967495, + "kl": 0.02214441355317831, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 70978499.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -0.874032735824585, + "rewards/thinking_verbosity_reward/std": 0.15739814937114716, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2724.0, + "completions/mean_length": 832.52734375, + "completions/mean_terminated_length": 805.9723510742188, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.6281179138321995, + "grad_norm": 0.011361267417669296, + "kl": 0.024048476247116923, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 71224458.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.0449676513671875, + "rewards/thinking_verbosity_reward/std": 0.20939169824123383, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2832.0, + "completions/mean_length": 881.6328125, + "completions/mean_terminated_length": 864.3858032226562, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "epoch": 0.6303854875283447, + "grad_norm": 0.009457490406930447, + "kl": 0.02115093218162656, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 71481652.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.0649299621582031, + "rewards/thinking_verbosity_reward/std": 0.0907767042517662, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2943.0, + "completions/mean_length": 864.88671875, + "completions/mean_terminated_length": 838.7154541015625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.6326530612244898, + "grad_norm": 0.012513005174696445, + "kl": 0.02190752769820392, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 71734159.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -0.93283611536026, + "rewards/thinking_verbosity_reward/std": 0.21806417405605316, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3036.0, + "completions/mean_length": 929.55859375, + "completions/mean_terminated_length": 904.1541748046875, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.6349206349206349, + "grad_norm": 0.011804268695414066, + "kl": 0.01918274478521198, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 72004814.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -0.9843831062316895, + "rewards/thinking_verbosity_reward/std": 0.18053489923477173, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2699.0, + "completions/mean_length": 840.6015625, + "completions/mean_terminated_length": 796.1514282226562, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 0.63718820861678, + "grad_norm": 0.012179218232631683, + "kl": 0.019511292688548565, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 72252272.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -1.0425779819488525, + "rewards/thinking_verbosity_reward/std": 0.3055434823036194, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 894.0390625, + "completions/mean_terminated_length": 841.76806640625, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.6394557823129252, + "grad_norm": 0.01097246166318655, + "kl": 0.02153337118215859, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 72513274.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -1.0385773181915283, + "rewards/thinking_verbosity_reward/std": 0.19707606732845306, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2281.0, + "completions/mean_length": 879.421875, + "completions/mean_terminated_length": 870.8236083984375, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "epoch": 0.6417233560090703, + "grad_norm": 0.010211371816694736, + "kl": 0.019702170742675662, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 72775118.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -0.9322717785835266, + "rewards/thinking_verbosity_reward/std": 0.1801954060792923, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2713.0, + "completions/mean_length": 977.2578125, + "completions/mean_terminated_length": 900.9312133789062, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "epoch": 0.6439909297052154, + "grad_norm": 0.010244780220091343, + "kl": 0.020986260613426566, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 73056000.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -1.213639259338379, + "rewards/thinking_verbosity_reward/std": 0.2808181345462799, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2890.0, + "completions/mean_length": 954.3515625, + "completions/mean_terminated_length": 841.0617065429688, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.6462585034013606, + "grad_norm": 0.008495507761836052, + "kl": 0.02124434500001371, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 73334002.0, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17536810040473938, + "rewards/thinking_verbosity_reward/mean": -1.0726574659347534, + "rewards/thinking_verbosity_reward/std": 0.3508376479148865, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2949.0, + "completions/mean_length": 831.390625, + "completions/mean_terminated_length": 768.4015502929688, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "epoch": 0.6485260770975056, + "grad_norm": 0.012875737622380257, + "kl": 0.0259325266815722, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 73578038.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.0057278871536255, + "rewards/thinking_verbosity_reward/std": 0.25236135721206665, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2967.0, + "completions/mean_length": 931.4140625, + "completions/mean_terminated_length": 897.4365844726562, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.6507936507936508, + "grad_norm": 0.011610771529376507, + "kl": 0.02063607983291149, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 73849920.0, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49501484632492065, + "rewards/thinking_verbosity_reward/mean": -1.0926625728607178, + "rewards/thinking_verbosity_reward/std": 0.2923538386821747, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2891.0, + "completions/mean_length": 908.3828125, + "completions/mean_terminated_length": 882.727294921875, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.6530612244897959, + "grad_norm": 0.011812858283519745, + "kl": 0.018927664728835225, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 74114082.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.0053209066390991, + "rewards/thinking_verbosity_reward/std": 0.2373671680688858, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2737.0, + "completions/mean_length": 900.84765625, + "completions/mean_terminated_length": 875.102783203125, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.655328798185941, + "grad_norm": 0.010036222636699677, + "kl": 0.02145698433741927, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 74378139.0, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -1.120572805404663, + "rewards/thinking_verbosity_reward/std": 0.2502773702144623, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2977.0, + "completions/mean_length": 941.03515625, + "completions/mean_terminated_length": 872.2943115234375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.6575963718820862, + "grad_norm": 0.012807481922209263, + "kl": 0.021023445995524526, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 74649548.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.0984432697296143, + "rewards/thinking_verbosity_reward/std": 0.23491103947162628, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2905.0, + "completions/mean_length": 897.98046875, + "completions/mean_terminated_length": 872.2015991210938, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.6598639455782312, + "grad_norm": 0.011617010459303856, + "kl": 0.01822663308121264, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 74912735.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -1.0392165184020996, + "rewards/thinking_verbosity_reward/std": 0.1826552003622055, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2856.0, + "completions/mean_length": 990.79296875, + "completions/mean_terminated_length": 914.9595336914062, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.6621315192743764, + "grad_norm": 0.011679367162287235, + "kl": 0.017092216294258833, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 75198834.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.006075143814087, + "rewards/thinking_verbosity_reward/std": 0.23654350638389587, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2638.0, + "completions/mean_length": 848.80859375, + "completions/mean_terminated_length": 822.4466552734375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.6643990929705216, + "grad_norm": 0.013492857106029987, + "kl": 0.01990956231020391, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 75452697.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -1.0637507438659668, + "rewards/thinking_verbosity_reward/std": 0.19302506744861603, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2873.0, + "completions/mean_length": 887.43359375, + "completions/mean_terminated_length": 870.2322998046875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.6666666666666666, + "grad_norm": 0.010862684808671474, + "kl": 0.022151886951178312, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 75715168.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -0.9618045687675476, + "rewards/thinking_verbosity_reward/std": 0.12650032341480255, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2171.0, + "completions/mean_length": 747.15234375, + "completions/mean_terminated_length": 738.0353393554688, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.6689342403628118, + "grad_norm": 0.012309734709560871, + "kl": 0.02314292755909264, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 75938623.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -0.91550213098526, + "rewards/thinking_verbosity_reward/std": 0.2006446123123169, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3047.0, + "completions/mean_length": 903.2421875, + "completions/mean_terminated_length": 842.2730712890625, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "epoch": 0.671201814058957, + "grad_norm": 0.010398944839835167, + "kl": 0.02157586091198027, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 76207293.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -0.9636365175247192, + "rewards/thinking_verbosity_reward/std": 0.14042963087558746, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3012.0, + "completions/mean_length": 940.9765625, + "completions/mean_terminated_length": 889.83203125, + "completions/min_length": 285.0, + "completions/min_terminated_length": 285.0, + "epoch": 0.673469387755102, + "grad_norm": 0.0093245143070817, + "kl": 0.01960503263399005, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 76480423.0, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.17536810040473938, + "rewards/thinking_verbosity_reward/mean": -1.1109051704406738, + "rewards/thinking_verbosity_reward/std": 0.25085359811782837, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3025.0, + "completions/mean_length": 825.54296875, + "completions/mean_terminated_length": 780.7928466796875, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "epoch": 0.6757369614512472, + "grad_norm": 0.00987502746284008, + "kl": 0.021809031954035163, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 76725962.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.1279023885726929, + "rewards/thinking_verbosity_reward/std": 0.33235612511634827, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2516.0, + "completions/mean_length": 883.8515625, + "completions/mean_terminated_length": 813.26611328125, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.6780045351473923, + "grad_norm": 0.013770343735814095, + "kl": 0.026639383053407073, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 76980124.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.1390994787216187, + "rewards/thinking_verbosity_reward/std": 0.3108987808227539, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3008.0, + "completions/mean_length": 1020.9296875, + "completions/mean_terminated_length": 963.26904296875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.6802721088435374, + "grad_norm": 0.009802062064409256, + "kl": 0.018902174197137356, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 77272434.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -1.1046098470687866, + "rewards/thinking_verbosity_reward/std": 0.2445298731327057, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2866.0, + "completions/mean_length": 999.66796875, + "completions/mean_terminated_length": 924.1578979492188, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "epoch": 0.6825396825396826, + "grad_norm": 0.011629792861640453, + "kl": 0.019971825648099184, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 77557085.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -1.0963420867919922, + "rewards/thinking_verbosity_reward/std": 0.2776513397693634, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2473.0, + "completions/mean_length": 915.04296875, + "completions/mean_terminated_length": 845.4636840820312, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.6848072562358276, + "grad_norm": 0.012851638719439507, + "kl": 0.02004775428213179, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 77823784.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.029089331626892, + "rewards/thinking_verbosity_reward/std": 0.17297162115573883, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2935.0, + "completions/mean_length": 1044.34375, + "completions/mean_terminated_length": 995.6800537109375, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "epoch": 0.6870748299319728, + "grad_norm": 0.01152093056589365, + "kl": 0.016170787857845426, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 78122024.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -1.1210309267044067, + "rewards/thinking_verbosity_reward/std": 0.27925747632980347, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2914.0, + "completions/mean_length": 928.0625, + "completions/mean_terminated_length": 867.7911376953125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.6893424036281179, + "grad_norm": 0.007436100859194994, + "kl": 0.02098172763362527, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 78392504.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.0652351379394531, + "rewards/thinking_verbosity_reward/std": 0.24608182907104492, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3064.0, + "completions/mean_length": 974.8984375, + "completions/mean_terminated_length": 933.12353515625, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.691609977324263, + "grad_norm": 0.01059749349951744, + "kl": 0.019479941576719284, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 78673006.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0294917821884155, + "rewards/thinking_verbosity_reward/std": 0.30255234241485596, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3070.0, + "completions/mean_length": 875.08984375, + "completions/mean_terminated_length": 795.04052734375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "epoch": 0.6938775510204082, + "grad_norm": 0.012975899502635002, + "kl": 0.024097177665680647, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 78929965.0, + "rewards/accuracy_reward/mean": 0.421875, + "rewards/accuracy_reward/std": 0.49776285886764526, + "rewards/thinking_verbosity_reward/mean": -1.0024305582046509, + "rewards/thinking_verbosity_reward/std": 0.2893208861351013, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2724.0, + "completions/mean_length": 875.0546875, + "completions/mean_terminated_length": 822.3280639648438, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.6961451247165533, + "grad_norm": 0.009661686606705189, + "kl": 0.01823381637223065, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 79188019.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -0.8731059432029724, + "rewards/thinking_verbosity_reward/std": 0.17691829800605774, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2654.0, + "completions/mean_length": 818.0390625, + "completions/mean_terminated_length": 791.312255859375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.6984126984126984, + "grad_norm": 0.012873414903879166, + "kl": 0.021295580314472318, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 79431365.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -1.0717928409576416, + "rewards/thinking_verbosity_reward/std": 0.2715516686439514, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2699.0, + "completions/mean_length": 890.90625, + "completions/mean_terminated_length": 847.4581909179688, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "epoch": 0.7006802721088435, + "grad_norm": 0.012073645368218422, + "kl": 0.020797141129150987, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 79696117.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -0.9974300861358643, + "rewards/thinking_verbosity_reward/std": 0.108637236058712, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2903.0, + "completions/mean_length": 842.4375, + "completions/mean_terminated_length": 816.0000610351562, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "epoch": 0.7029478458049887, + "grad_norm": 0.010629677213728428, + "kl": 0.021952383453026414, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 79941709.0, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "rewards/thinking_verbosity_reward/mean": -0.9437144994735718, + "rewards/thinking_verbosity_reward/std": 0.16069242358207703, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2640.0, + "completions/mean_length": 878.796875, + "completions/mean_terminated_length": 861.5275268554688, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "epoch": 0.7052154195011338, + "grad_norm": 0.010162542574107647, + "kl": 0.020370794693008065, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 80199417.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -0.9995274543762207, + "rewards/thinking_verbosity_reward/std": 0.22679492831230164, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2748.0, + "completions/mean_length": 945.359375, + "completions/mean_terminated_length": 858.9105224609375, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "epoch": 0.7074829931972789, + "grad_norm": 0.012234157882630825, + "kl": 0.02211676025763154, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 80469693.0, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48795005679130554, + "rewards/thinking_verbosity_reward/mean": -1.0178935527801514, + "rewards/thinking_verbosity_reward/std": 0.3230345845222473, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2003.0, + "completions/mean_length": 783.296875, + "completions/mean_terminated_length": 728.3680419921875, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.7097505668934241, + "grad_norm": 0.01134686078876257, + "kl": 0.023076176643371582, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 80702721.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -1.000784158706665, + "rewards/thinking_verbosity_reward/std": 0.1910533905029297, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2589.0, + "completions/mean_length": 867.6796875, + "completions/mean_terminated_length": 814.7760620117188, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.7120181405895691, + "grad_norm": 0.014482229016721249, + "kl": 0.022223971551284194, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 80957447.0, + "rewards/accuracy_reward/mean": 0.40625, + "rewards/accuracy_reward/std": 0.49501484632492065, + "rewards/thinking_verbosity_reward/mean": -1.0495896339416504, + "rewards/thinking_verbosity_reward/std": 0.32986608147621155, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2602.0, + "completions/mean_length": 903.8359375, + "completions/mean_terminated_length": 878.1265258789062, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.7142857142857143, + "grad_norm": 0.009476722218096256, + "kl": 0.02250252035446465, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 81220221.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.0043216943740845, + "rewards/thinking_verbosity_reward/std": 0.16453982889652252, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2215.0, + "completions/mean_length": 906.0546875, + "completions/mean_terminated_length": 836.1854858398438, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.7165532879818595, + "grad_norm": 0.01288322452455759, + "kl": 0.020796943921595812, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 81483347.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -0.972916841506958, + "rewards/thinking_verbosity_reward/std": 0.17860597372055054, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2846.0, + "completions/mean_length": 965.0625, + "completions/mean_terminated_length": 879.4146118164062, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "epoch": 0.7188208616780045, + "grad_norm": 0.010013082064688206, + "kl": 0.020892267813906074, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 81762371.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.056807518005371, + "rewards/thinking_verbosity_reward/std": 0.26680320501327515, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1833.0, + "completions/mean_length": 770.44140625, + "completions/mean_terminated_length": 743.1502075195312, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.7210884353741497, + "grad_norm": 0.013360545039176941, + "kl": 0.02220750949345529, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 81990876.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.0514169931411743, + "rewards/thinking_verbosity_reward/std": 0.2271958887577057, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2772.0, + "completions/mean_length": 902.86328125, + "completions/mean_terminated_length": 885.783447265625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.7233560090702947, + "grad_norm": 0.01136589702218771, + "kl": 0.024247736437246203, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 82254033.0, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48795005679130554, + "rewards/thinking_verbosity_reward/mean": -0.9477270841598511, + "rewards/thinking_verbosity_reward/std": 0.16990022361278534, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2164.0, + "completions/mean_length": 873.515625, + "completions/mean_terminated_length": 829.7211303710938, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.7256235827664399, + "grad_norm": 0.012802891433238983, + "kl": 0.0237765999045223, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 82509453.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.017584204673767, + "rewards/thinking_verbosity_reward/std": 0.12113430351018906, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2174.0, + "completions/mean_length": 830.6015625, + "completions/mean_terminated_length": 776.8080444335938, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.7278911564625851, + "grad_norm": 0.012293724343180656, + "kl": 0.0218400324229151, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 82755583.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -0.9463956356048584, + "rewards/thinking_verbosity_reward/std": 0.14419758319854736, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2878.0, + "completions/mean_length": 893.875, + "completions/mean_terminated_length": 850.486083984375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "epoch": 0.7301587301587301, + "grad_norm": 0.011995355598628521, + "kl": 0.022647992707788944, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 83016287.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -1.0414869785308838, + "rewards/thinking_verbosity_reward/std": 0.1384163498878479, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2799.0, + "completions/mean_length": 922.55859375, + "completions/mean_terminated_length": 879.7410888671875, + "completions/min_length": 280.0, + "completions/min_terminated_length": 280.0, + "epoch": 0.7324263038548753, + "grad_norm": 0.013697410933673382, + "kl": 0.020713827339932323, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 83282862.0, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4531635046005249, + "rewards/thinking_verbosity_reward/mean": -1.0619264841079712, + "rewards/thinking_verbosity_reward/std": 0.23914454877376556, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3019.0, + "completions/mean_length": 949.11328125, + "completions/mean_terminated_length": 898.1640625, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "epoch": 0.7346938775510204, + "grad_norm": 0.009018144570291042, + "kl": 0.020933196181431413, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 83558195.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.0263524055480957, + "rewards/thinking_verbosity_reward/std": 0.2151019424200058, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2993.0, + "completions/mean_length": 939.46484375, + "completions/mean_terminated_length": 879.5140380859375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "epoch": 0.7369614512471655, + "grad_norm": 0.01184019073843956, + "kl": 0.021151390857994556, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 83832834.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0577847957611084, + "rewards/thinking_verbosity_reward/std": 0.22137169539928436, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2841.0, + "completions/mean_length": 857.9375, + "completions/mean_terminated_length": 831.683837890625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.7392290249433107, + "grad_norm": 0.01276103500276804, + "kl": 0.024785081390291452, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 84083626.0, + "rewards/accuracy_reward/mean": 0.375, + "rewards/accuracy_reward/std": 0.48795005679130554, + "rewards/thinking_verbosity_reward/mean": -1.0117533206939697, + "rewards/thinking_verbosity_reward/std": 0.14836278557777405, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 2900.0, + "completions/max_terminated_length": 2900.0, + "completions/mean_length": 953.71484375, + "completions/mean_terminated_length": 946.2745361328125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 0.7414965986394558, + "grad_norm": 0.010525109246373177, + "kl": 0.024311045184731483, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 84361281.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.0332300662994385, + "rewards/thinking_verbosity_reward/std": 0.230626180768013, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2943.0, + "completions/mean_length": 954.88671875, + "completions/mean_terminated_length": 886.5927124023438, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "epoch": 0.7437641723356009, + "grad_norm": 0.012650595977902412, + "kl": 0.02178082033060491, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 84639428.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -1.0726279020309448, + "rewards/thinking_verbosity_reward/std": 0.32638081908226013, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2260.0, + "completions/mean_length": 980.8125, + "completions/mean_terminated_length": 930.6240234375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.746031746031746, + "grad_norm": 0.012714527547359467, + "kl": 0.02033833460882306, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 84922580.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.114803433418274, + "rewards/thinking_verbosity_reward/std": 0.17521648108959198, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1847.0, + "completions/mean_length": 884.83984375, + "completions/mean_terminated_length": 850.123046875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.7482993197278912, + "grad_norm": 0.013733788393437862, + "kl": 0.020902864867821336, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 85183163.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9728517532348633, + "rewards/thinking_verbosity_reward/std": 0.204330176115036, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2502.0, + "completions/mean_length": 859.1484375, + "completions/mean_terminated_length": 815.0677490234375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.7505668934240363, + "grad_norm": 0.011745438911020756, + "kl": 0.020472461124882102, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 85437041.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.0405341386795044, + "rewards/thinking_verbosity_reward/std": 0.26611489057540894, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2914.0, + "completions/mean_length": 852.1171875, + "completions/mean_terminated_length": 825.7944946289062, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.7528344671201814, + "grad_norm": 0.013163256458938122, + "kl": 0.02664191392250359, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 85686591.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.0226783752441406, + "rewards/thinking_verbosity_reward/std": 0.2356773465871811, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3052.0, + "completions/mean_length": 949.015625, + "completions/mean_terminated_length": 898.0640258789062, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "epoch": 0.7551020408163265, + "grad_norm": 0.01223345473408699, + "kl": 0.020331664476543665, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 85961435.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.0425175428390503, + "rewards/thinking_verbosity_reward/std": 0.21267595887184143, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2896.0, + "completions/mean_length": 881.38671875, + "completions/mean_terminated_length": 846.6151123046875, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.7573696145124716, + "grad_norm": 0.01158929243683815, + "kl": 0.02156754839234054, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 86219246.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -0.9513006210327148, + "rewards/thinking_verbosity_reward/std": 0.14863063395023346, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2648.0, + "completions/mean_length": 895.5390625, + "completions/mean_terminated_length": 860.9921264648438, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "epoch": 0.7596371882086168, + "grad_norm": 0.010920090600848198, + "kl": 0.022397283231839538, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 86481912.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.0625889301300049, + "rewards/thinking_verbosity_reward/std": 0.27037325501441956, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2514.0, + "completions/mean_length": 927.58984375, + "completions/mean_terminated_length": 867.30517578125, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "epoch": 0.7619047619047619, + "grad_norm": 0.011028756387531757, + "kl": 0.02560985228046775, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 86751239.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -0.9973180890083313, + "rewards/thinking_verbosity_reward/std": 0.23009058833122253, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2387.0, + "completions/mean_length": 816.64453125, + "completions/mean_terminated_length": 789.9012451171875, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "epoch": 0.764172335600907, + "grad_norm": 0.010820006020367146, + "kl": 0.022202198626473546, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 86994228.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.0006357431411743, + "rewards/thinking_verbosity_reward/std": 0.2231128215789795, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2991.0, + "completions/mean_length": 862.21484375, + "completions/mean_terminated_length": 827.138916015625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "epoch": 0.7664399092970522, + "grad_norm": 0.01306674163788557, + "kl": 0.025442039826884866, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 87249539.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.012353777885437, + "rewards/thinking_verbosity_reward/std": 0.1940152794122696, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2899.0, + "completions/mean_length": 933.2265625, + "completions/mean_terminated_length": 818.8065795898438, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "epoch": 0.7687074829931972, + "grad_norm": 0.013324134051799774, + "kl": 0.026345149613916874, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 87520173.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -0.9795743227005005, + "rewards/thinking_verbosity_reward/std": 0.23604832589626312, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2921.0, + "completions/mean_length": 820.77734375, + "completions/mean_terminated_length": 785.043701171875, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "epoch": 0.7709750566893424, + "grad_norm": 0.011408631689846516, + "kl": 0.02430082391947508, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 87761724.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -0.9276875257492065, + "rewards/thinking_verbosity_reward/std": 0.1430305391550064, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2955.0, + "completions/mean_length": 877.84375, + "completions/mean_terminated_length": 825.1840209960938, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.7732426303854876, + "grad_norm": 0.0153511306270957, + "kl": 0.024690945399925113, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 88019596.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -0.9734405279159546, + "rewards/thinking_verbosity_reward/std": 0.25609883666038513, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3048.0, + "completions/mean_length": 835.859375, + "completions/mean_terminated_length": 791.3147583007812, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.7755102040816326, + "grad_norm": 0.012589513324201107, + "kl": 0.024272996000945568, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 88266728.0, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "rewards/thinking_verbosity_reward/mean": -0.9212467670440674, + "rewards/thinking_verbosity_reward/std": 0.19993983209133148, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2664.0, + "completions/mean_length": 896.3046875, + "completions/mean_terminated_length": 817.0283813476562, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.7777777777777778, + "grad_norm": 0.012007050216197968, + "kl": 0.022282492136582732, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 88529238.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.1167476177215576, + "rewards/thinking_verbosity_reward/std": 0.2829549312591553, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2923.0, + "completions/mean_length": 912.44921875, + "completions/mean_terminated_length": 886.8419189453125, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "epoch": 0.780045351473923, + "grad_norm": 0.01236378401517868, + "kl": 0.019154070410877466, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 88795489.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -1.0034823417663574, + "rewards/thinking_verbosity_reward/std": 0.26699358224868774, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2781.0, + "completions/mean_length": 916.80859375, + "completions/mean_terminated_length": 899.8385620117188, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.782312925170068, + "grad_norm": 0.010480931960046291, + "kl": 0.023243877571076155, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 89063864.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.0152311325073242, + "rewards/thinking_verbosity_reward/std": 0.20837894082069397, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2533.0, + "completions/mean_length": 837.6171875, + "completions/mean_terminated_length": 828.85498046875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.7845804988662132, + "grad_norm": 0.014342579059302807, + "kl": 0.023303609574213624, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 89308718.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -0.925529956817627, + "rewards/thinking_verbosity_reward/std": 0.21444793045520782, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3029.0, + "completions/mean_length": 906.7890625, + "completions/mean_terminated_length": 854.8240356445312, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "epoch": 0.7868480725623582, + "grad_norm": 0.011636631563305855, + "kl": 0.022323706652969122, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 89572384.0, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.4604927599430084, + "rewards/thinking_verbosity_reward/mean": -0.986864447593689, + "rewards/thinking_verbosity_reward/std": 0.2128904014825821, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1848.0, + "completions/max_terminated_length": 1848.0, + "completions/mean_length": 815.9609375, + "completions/mean_terminated_length": 815.9609375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "epoch": 0.7891156462585034, + "grad_norm": 0.011256031692028046, + "kl": 0.022677167784422636, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 89815718.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -0.9995848536491394, + "rewards/thinking_verbosity_reward/std": 0.1469603329896927, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2744.0, + "completions/mean_length": 928.15625, + "completions/mean_terminated_length": 894.1270141601562, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.7913832199546486, + "grad_norm": 0.013190764002501965, + "kl": 0.024346552323549986, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 90085966.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -1.034498929977417, + "rewards/thinking_verbosity_reward/std": 0.2986200451850891, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2808.0, + "completions/mean_length": 915.19140625, + "completions/mean_terminated_length": 845.6168823242188, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.7936507936507936, + "grad_norm": 0.013695375062525272, + "kl": 0.021782488794997334, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 90354335.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9894850254058838, + "rewards/thinking_verbosity_reward/std": 0.2085985541343689, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2054.0, + "completions/mean_length": 830.38671875, + "completions/mean_terminated_length": 803.8063354492188, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.7959183673469388, + "grad_norm": 0.014621969312429428, + "kl": 0.02188040455803275, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 90600178.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0633437633514404, + "rewards/thinking_verbosity_reward/std": 0.154441699385643, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2460.0, + "completions/mean_length": 961.37890625, + "completions/mean_terminated_length": 953.1019897460938, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.7981859410430839, + "grad_norm": 0.010696292854845524, + "kl": 0.018464236753061414, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 90877027.0, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.4604927599430084, + "rewards/thinking_verbosity_reward/mean": -1.0135011672973633, + "rewards/thinking_verbosity_reward/std": 0.16011835634708405, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2723.0, + "completions/mean_length": 945.08984375, + "completions/mean_terminated_length": 885.2971801757812, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "epoch": 0.800453514739229, + "grad_norm": 0.011939599178731441, + "kl": 0.020967474672943354, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 91150970.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -1.0550614595413208, + "rewards/thinking_verbosity_reward/std": 0.17539499700069427, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3044.0, + "completions/mean_length": 829.7109375, + "completions/mean_terminated_length": 785.0438232421875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 0.8027210884353742, + "grad_norm": 0.014829556457698345, + "kl": 0.025762970559298992, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 91392720.0, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3333333432674408, + "rewards/thinking_verbosity_reward/mean": -1.0553654432296753, + "rewards/thinking_verbosity_reward/std": 0.2873537540435791, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2688.0, + "completions/mean_length": 987.3125, + "completions/mean_terminated_length": 928.706787109375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 0.8049886621315193, + "grad_norm": 0.011634284630417824, + "kl": 0.02299315994605422, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 91677400.0, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.4604927599430084, + "rewards/thinking_verbosity_reward/mean": -1.0839152336120605, + "rewards/thinking_verbosity_reward/std": 0.28412118554115295, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2260.0, + "completions/mean_length": 834.06640625, + "completions/mean_terminated_length": 825.2902221679688, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.8072562358276644, + "grad_norm": 0.011327235959470272, + "kl": 0.02313454425893724, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 91925537.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -1.0198473930358887, + "rewards/thinking_verbosity_reward/std": 0.18449120223522186, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2402.0, + "completions/mean_length": 851.4921875, + "completions/mean_terminated_length": 779.8628540039062, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.8095238095238095, + "grad_norm": 0.011043939739465714, + "kl": 0.023983490653336048, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 92179423.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -1.0275758504867554, + "rewards/thinking_verbosity_reward/std": 0.26768043637275696, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2918.0, + "completions/mean_length": 1017.87109375, + "completions/mean_terminated_length": 943.0242919921875, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "epoch": 0.8117913832199547, + "grad_norm": 0.010818707756698132, + "kl": 0.020515048410743475, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 92470966.0, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "rewards/thinking_verbosity_reward/mean": -1.0695469379425049, + "rewards/thinking_verbosity_reward/std": 0.17117424309253693, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2101.0, + "completions/mean_length": 906.26171875, + "completions/mean_terminated_length": 836.399169921875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "epoch": 0.8140589569160998, + "grad_norm": 0.012749510817229748, + "kl": 0.025132799288257957, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 92736881.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -1.081768274307251, + "rewards/thinking_verbosity_reward/std": 0.2766227722167969, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2728.0, + "completions/mean_length": 825.86328125, + "completions/mean_terminated_length": 799.2293090820312, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "epoch": 0.8163265306122449, + "grad_norm": 0.010510360822081566, + "kl": 0.021572199184447527, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 92983606.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -0.9859007596969604, + "rewards/thinking_verbosity_reward/std": 0.191152885556221, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3056.0, + "completions/mean_length": 795.078125, + "completions/mean_terminated_length": 749.7211303710938, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.81859410430839, + "grad_norm": 0.016560906544327736, + "kl": 0.0297054226975888, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 93223330.0, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.501733124256134, + "rewards/thinking_verbosity_reward/mean": -0.8862563371658325, + "rewards/thinking_verbosity_reward/std": 0.3391442596912384, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2797.0, + "completions/mean_length": 929.0078125, + "completions/mean_terminated_length": 886.3187255859375, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.8208616780045351, + "grad_norm": 0.011147328652441502, + "kl": 0.023085664259269834, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 93496708.0, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3503824472427368, + "rewards/thinking_verbosity_reward/mean": -1.0022252798080444, + "rewards/thinking_verbosity_reward/std": 0.19769883155822754, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2474.0, + "completions/mean_length": 961.1328125, + "completions/mean_terminated_length": 919.0836791992188, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "epoch": 0.8231292517006803, + "grad_norm": 0.009768502786755562, + "kl": 0.023698203498497605, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 93774686.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.0125658512115479, + "rewards/thinking_verbosity_reward/std": 0.11436259001493454, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2611.0, + "completions/mean_length": 965.53515625, + "completions/mean_terminated_length": 897.5846557617188, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.8253968253968254, + "grad_norm": 0.009819863364100456, + "kl": 0.022506618639454246, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 94057127.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -1.0088179111480713, + "rewards/thinking_verbosity_reward/std": 0.3067062199115753, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 802.97265625, + "completions/mean_terminated_length": 794.0745849609375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "epoch": 0.8276643990929705, + "grad_norm": 0.01071020495146513, + "kl": 0.023376145865768194, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 94296288.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -0.9137327671051025, + "rewards/thinking_verbosity_reward/std": 0.1676597148180008, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2477.0, + "completions/mean_length": 825.4453125, + "completions/mean_terminated_length": 789.7857666015625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.8299319727891157, + "grad_norm": 0.013528002426028252, + "kl": 0.028941427590325475, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 94538722.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -1.0713086128234863, + "rewards/thinking_verbosity_reward/std": 0.2203737050294876, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3050.0, + "completions/mean_length": 857.3984375, + "completions/mean_terminated_length": 795.1405029296875, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.8321995464852607, + "grad_norm": 0.012980113737285137, + "kl": 0.02380442596040666, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 94791776.0, + "rewards/accuracy_reward/mean": 0.390625, + "rewards/accuracy_reward/std": 0.4917473793029785, + "rewards/thinking_verbosity_reward/mean": -1.0246860980987549, + "rewards/thinking_verbosity_reward/std": 0.3006916046142578, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2836.0, + "completions/mean_length": 952.45703125, + "completions/mean_terminated_length": 866.2966918945312, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.8344671201814059, + "grad_norm": 0.012682869099080563, + "kl": 0.024155371822416782, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 95065013.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9649271965026855, + "rewards/thinking_verbosity_reward/std": 0.18508249521255493, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3045.0, + "completions/mean_length": 878.0, + "completions/mean_terminated_length": 843.1746215820312, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "epoch": 0.8367346938775511, + "grad_norm": 0.014257029630243778, + "kl": 0.024025337304919958, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 95321413.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -0.9753512144088745, + "rewards/thinking_verbosity_reward/std": 0.22275876998901367, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2592.0, + "completions/mean_length": 868.61328125, + "completions/mean_terminated_length": 824.7211303710938, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "epoch": 0.8390022675736961, + "grad_norm": 0.011700167320668697, + "kl": 0.025986819062381983, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 95574914.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -0.9775135517120361, + "rewards/thinking_verbosity_reward/std": 0.24155281484127045, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2871.0, + "completions/mean_length": 849.265625, + "completions/mean_terminated_length": 768.2753295898438, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.8412698412698413, + "grad_norm": 0.013137934729456902, + "kl": 0.024484576424583793, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 95823166.0, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "rewards/thinking_verbosity_reward/mean": -1.0904556512832642, + "rewards/thinking_verbosity_reward/std": 0.3829812705516815, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3057.0, + "completions/mean_length": 920.75390625, + "completions/mean_terminated_length": 824.1672973632812, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "epoch": 0.8435374149659864, + "grad_norm": 0.010617600753903389, + "kl": 0.027802387718111277, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 96093615.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -0.961787760257721, + "rewards/thinking_verbosity_reward/std": 0.20168812572956085, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3063.0, + "completions/mean_length": 883.91015625, + "completions/mean_terminated_length": 822.3975830078125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.8458049886621315, + "grad_norm": 0.012444906868040562, + "kl": 0.025895779253914952, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 96353424.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -1.0143842697143555, + "rewards/thinking_verbosity_reward/std": 0.1348470151424408, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2601.0, + "completions/mean_length": 819.31640625, + "completions/mean_terminated_length": 801.5787353515625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 0.8480725623582767, + "grad_norm": 0.013174058869481087, + "kl": 0.02888593915849924, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 96594665.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.3145764470100403, + "rewards/thinking_verbosity_reward/mean": -1.038906216621399, + "rewards/thinking_verbosity_reward/std": 0.19068105518817902, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2758.0, + "completions/mean_length": 862.21875, + "completions/mean_terminated_length": 818.19921875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "epoch": 0.8503401360544217, + "grad_norm": 0.012020350433886051, + "kl": 0.022775056306272745, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 96847249.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.0227077007293701, + "rewards/thinking_verbosity_reward/std": 0.13975083827972412, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2775.0, + "completions/mean_length": 948.8515625, + "completions/mean_terminated_length": 889.1646118164062, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "epoch": 0.8526077097505669, + "grad_norm": 0.014266155660152435, + "kl": 0.020077088847756386, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 97124515.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.00198495388031, + "rewards/thinking_verbosity_reward/std": 0.24105188250541687, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2320.0, + "completions/mean_length": 828.2734375, + "completions/mean_terminated_length": 774.4240112304688, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.854875283446712, + "grad_norm": 0.014739529229700565, + "kl": 0.02488754875957966, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 97367497.0, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.4836103618144989, + "rewards/thinking_verbosity_reward/mean": -0.9463934302330017, + "rewards/thinking_verbosity_reward/std": 0.13109609484672546, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3054.0, + "completions/mean_length": 854.3515625, + "completions/mean_terminated_length": 828.0553588867188, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 0.8571428571428571, + "grad_norm": 0.0116888964548707, + "kl": 0.02213233965449035, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 97617491.0, + "rewards/accuracy_reward/mean": 0.453125, + "rewards/accuracy_reward/std": 0.501733124256134, + "rewards/thinking_verbosity_reward/mean": -1.0228101015090942, + "rewards/thinking_verbosity_reward/std": 0.3086584806442261, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2686.0, + "completions/mean_length": 807.99609375, + "completions/mean_terminated_length": 790.1693115234375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.8594104308390023, + "grad_norm": 0.012171067297458649, + "kl": 0.025199716445058584, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 97859482.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -0.9589796662330627, + "rewards/thinking_verbosity_reward/std": 0.19631190598011017, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2825.0, + "completions/mean_length": 967.42578125, + "completions/mean_terminated_length": 845.6735229492188, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "epoch": 0.8616780045351474, + "grad_norm": 0.009905674494802952, + "kl": 0.023127440363168716, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 98139887.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -0.9449113607406616, + "rewards/thinking_verbosity_reward/std": 0.18759644031524658, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2600.0, + "completions/mean_length": 909.41796875, + "completions/mean_terminated_length": 857.5160522460938, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "epoch": 0.8639455782312925, + "grad_norm": 0.01118555199354887, + "kl": 0.024022630183026195, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 98406618.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -0.9754558801651001, + "rewards/thinking_verbosity_reward/std": 0.15726324915885925, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2934.0, + "completions/mean_length": 921.17578125, + "completions/mean_terminated_length": 860.7108154296875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.8662131519274376, + "grad_norm": 0.012446076609194279, + "kl": 0.026879936456680298, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 98676615.0, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21304203569889069, + "rewards/thinking_verbosity_reward/mean": -1.0933247804641724, + "rewards/thinking_verbosity_reward/std": 0.2685801684856415, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2918.0, + "completions/mean_length": 802.3984375, + "completions/mean_terminated_length": 784.5275268554688, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "epoch": 0.8684807256235828, + "grad_norm": 0.013311016373336315, + "kl": 0.03001387231051922, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 98914973.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -0.9257777333259583, + "rewards/thinking_verbosity_reward/std": 0.16963444650173187, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 1899.0, + "completions/mean_length": 847.53125, + "completions/mean_terminated_length": 838.8079223632812, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "epoch": 0.8707482993197279, + "grad_norm": 0.013902475126087666, + "kl": 0.025859171524643898, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 99164797.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.0471138954162598, + "rewards/thinking_verbosity_reward/std": 0.1812586635351181, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2878.0, + "completions/mean_length": 901.78125, + "completions/mean_terminated_length": 867.3333740234375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "epoch": 0.873015873015873, + "grad_norm": 0.0133685152977705, + "kl": 0.025461049983277917, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 99430485.0, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39339789748191833, + "rewards/thinking_verbosity_reward/mean": -1.0145424604415894, + "rewards/thinking_verbosity_reward/std": 0.19272588193416595, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2864.0, + "completions/mean_length": 827.71484375, + "completions/mean_terminated_length": 783.0079956054688, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.8752834467120182, + "grad_norm": 0.01594766043126583, + "kl": 0.02964222407899797, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 99673068.0, + "rewards/accuracy_reward/mean": 0.3125, + "rewards/accuracy_reward/std": 0.467176616191864, + "rewards/thinking_verbosity_reward/mean": -0.9606199264526367, + "rewards/thinking_verbosity_reward/std": 0.19574454426765442, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 811.8046875, + "completions/mean_terminated_length": 802.9412231445312, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.8775510204081632, + "grad_norm": 0.012479163706302643, + "kl": 0.0270172746386379, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 99913522.0, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24397502839565277, + "rewards/thinking_verbosity_reward/mean": -0.9852149486541748, + "rewards/thinking_verbosity_reward/std": 0.12074504047632217, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2757.0, + "completions/mean_length": 875.03515625, + "completions/mean_terminated_length": 822.3080444335938, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "epoch": 0.8798185941043084, + "grad_norm": 0.012371636927127838, + "kl": 0.02950235945172608, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 100169219.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -0.9594322443008423, + "rewards/thinking_verbosity_reward/std": 0.2089218944311142, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2890.0, + "completions/mean_length": 834.9296875, + "completions/mean_terminated_length": 781.2400512695312, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.8820861678004536, + "grad_norm": 0.013973907567560673, + "kl": 0.03134762542322278, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 100412577.0, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.36596253514289856, + "rewards/thinking_verbosity_reward/mean": -0.977250337600708, + "rewards/thinking_verbosity_reward/std": 0.13266722857952118, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2310.0, + "completions/mean_length": 899.3984375, + "completions/mean_terminated_length": 838.3212280273438, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.8843537414965986, + "grad_norm": 0.010782791301608086, + "kl": 0.023816830478608608, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 100680071.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.0231471061706543, + "rewards/thinking_verbosity_reward/std": 0.21741190552711487, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2548.0, + "completions/mean_length": 847.453125, + "completions/mean_terminated_length": 821.0751342773438, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "epoch": 0.8866213151927438, + "grad_norm": 0.013053447008132935, + "kl": 0.023653516080230474, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 100930563.0, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44515693187713623, + "rewards/thinking_verbosity_reward/mean": -0.9507926106452942, + "rewards/thinking_verbosity_reward/std": 0.14549165964126587, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2933.0, + "completions/mean_length": 927.3203125, + "completions/mean_terminated_length": 875.8480224609375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 0.8888888888888888, + "grad_norm": 0.013147743418812752, + "kl": 0.026312687434256077, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 101199677.0, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.38025420904159546, + "rewards/thinking_verbosity_reward/mean": -1.04852294921875, + "rewards/thinking_verbosity_reward/std": 0.2355634570121765, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3047.0, + "completions/mean_length": 955.30859375, + "completions/mean_terminated_length": 878.1821899414062, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "epoch": 0.891156462585034, + "grad_norm": 0.01144398096948862, + "kl": 0.021620240760967135, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 101476972.0, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.4166666865348816, + "rewards/thinking_verbosity_reward/mean": -1.1334986686706543, + "rewards/thinking_verbosity_reward/std": 0.3079391121864319, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2893.0, + "completions/mean_length": 867.46484375, + "completions/mean_terminated_length": 805.4899291992188, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 0.8934240362811792, + "grad_norm": 0.013954801484942436, + "kl": 0.024199879029765725, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 101730683.0, + "rewards/accuracy_reward/mean": 0.34375, + "rewards/accuracy_reward/std": 0.4787135720252991, + "rewards/thinking_verbosity_reward/mean": -0.9524452686309814, + "rewards/thinking_verbosity_reward/std": 0.21592432260513306, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2199.0, + "completions/mean_length": 826.875, + "completions/mean_terminated_length": 772.9920654296875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.8956916099773242, + "grad_norm": 0.014939767308533192, + "kl": 0.030061365570873022, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 101974027.0, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.27048972249031067, + "rewards/thinking_verbosity_reward/mean": -1.002968192100525, + "rewards/thinking_verbosity_reward/std": 0.2641099989414215, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2970.0, + "completions/mean_length": 884.24609375, + "completions/mean_terminated_length": 840.6653442382812, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "epoch": 0.8979591836734694, + "grad_norm": 0.012335467152297497, + "kl": 0.027110422030091286, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 102232690.0, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42695629596710205, + "rewards/thinking_verbosity_reward/mean": -1.0439600944519043, + "rewards/thinking_verbosity_reward/std": 0.2193758487701416, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2945.0, + "completions/mean_length": 845.32421875, + "completions/mean_terminated_length": 782.7268676757812, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "epoch": 0.9002267573696145, + "grad_norm": 0.014811097644269466, + "kl": 0.029066244373098016, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 102478589.0, + "rewards/accuracy_reward/mean": 0.328125, + "rewards/accuracy_reward/std": 0.4732423722743988, + "rewards/thinking_verbosity_reward/mean": -1.0271347761154175, + "rewards/thinking_verbosity_reward/std": 0.23229792714118958, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2410.0, + "completions/mean_length": 848.125, + "completions/mean_terminated_length": 812.825439453125, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.9024943310657596, + "grad_norm": 0.013779336586594582, + "kl": 0.025711732218042016, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 102727565.0, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40550529956817627, + "rewards/thinking_verbosity_reward/mean": -1.033552885055542, + "rewards/thinking_verbosity_reward/std": 0.18700554966926575, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2137.0, + "completions/mean_length": 795.30859375, + "completions/mean_terminated_length": 768.312255859375, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 0.9047619047619048, + "grad_norm": 0.014009837061166763, + "kl": 0.02319841179996729, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 102965188.0, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.4364357888698578, + "rewards/thinking_verbosity_reward/mean": -0.9752815961837769, + "rewards/thinking_verbosity_reward/std": 0.1764889508485794, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 2390.0, + "completions/mean_length": 898.0546875, + "completions/mean_terminated_length": 854.7490234375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 0.9070294784580499, + "grad_norm": 0.013971750624477863, + "kl": 0.023204547353088856, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 103226306.0, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29378482699394226, + "rewards/thinking_verbosity_reward/mean": -1.0046699047088623, + "rewards/thinking_verbosity_reward/std": 0.19317185878753662, + "step": 400 + } + ], + "logging_steps": 1, + "max_steps": 441, + "num_input_tokens_seen": 103226306, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}