diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,24043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.73724117818606, + "eval_steps": 500, + "global_step": 1600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0200892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3119.0, + "completions/mean_length": 606.5625, + "completions/mean_terminated_length": 535.0250854492188, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 0.0023330417031204435, + "grad_norm": 0.139825239777565, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 556256.0, + "reward": 0.5424107313156128, + "reward_std": 0.24488291144371033, + "rewards/verify_math_reward/mean": 0.5424107313156128, + "rewards/verify_math_reward/std": 0.4984763562679291, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0022986409749137238, + "clip_ratio/high_mean": 0.0010865736549021676, + "clip_ratio/low_mean": 0.0006070504496165086, + "clip_ratio/low_min": 3.928235310013406e-05, + "clip_ratio/region_mean": 0.0016936241154326126, + "epoch": 0.004666083406240887, + "grad_norm": 0.13357040286064148, + "learning_rate": 1e-06, + "loss": -0.0052, + "step": 2 + }, + { + "clip_ratio/high_max": 0.002638528043462429, + "clip_ratio/high_mean": 0.0011571372342586983, + "clip_ratio/low_mean": 0.0006701715410599718, + "clip_ratio/low_min": 7.16715467206086e-05, + "clip_ratio/region_mean": 0.00182730880624149, + "epoch": 0.00699912510936133, + "grad_norm": 0.12927649915218353, + "learning_rate": 1e-06, + "loss": -0.0053, + "step": 3 + }, + { + "clip_ratio/high_max": 0.002463254946633242, + "clip_ratio/high_mean": 0.0010911843601206783, + "clip_ratio/low_mean": 0.0006269629561757029, + "clip_ratio/low_min": 5.3640959777112585e-05, + "clip_ratio/region_mean": 0.0017181472649099305, + "epoch": 0.009332166812481774, + "grad_norm": 0.13325555622577667, + "learning_rate": 1e-06, + "loss": -0.0053, + "step": 4 + }, + { + "clip_ratio/high_max": 0.002511580994905671, + "clip_ratio/high_mean": 0.0010959685314446688, + "clip_ratio/low_mean": 0.0007915860005596187, + "clip_ratio/low_min": 5.303173657011939e-05, + "clip_ratio/region_mean": 0.0018875545429182239, + "completions/clipped_ratio": 0.010044642857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2607.0, + "completions/mean_length": 590.552490234375, + "completions/mean_terminated_length": 554.9841918945312, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.011665208515602217, + "grad_norm": 0.1198158785700798, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 1135783.0, + "reward": 0.4832589626312256, + "reward_std": 0.2370942384004593, + "rewards/verify_math_reward/mean": 0.4832589328289032, + "rewards/verify_math_reward/std": 0.4999987483024597, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0023428853455698118, + "clip_ratio/high_mean": 0.0009512929136690218, + "clip_ratio/low_mean": 0.0005539788216992747, + "clip_ratio/low_min": 3.876696337101748e-05, + "clip_ratio/region_mean": 0.0015052717571961693, + "epoch": 0.01399825021872266, + "grad_norm": 0.12222807109355927, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0023241781964316033, + "clip_ratio/high_mean": 0.0010337102903577033, + "clip_ratio/low_mean": 0.0005319816227711271, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001565691884025, + "epoch": 0.016331291921843103, + "grad_norm": 0.1279895007610321, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 7 + }, + { + "clip_ratio/high_max": 0.002724015103012789, + "clip_ratio/high_mean": 0.0011034075978386682, + "clip_ratio/low_mean": 0.0006094714681239566, + "clip_ratio/low_min": 2.932166989921825e-05, + "clip_ratio/region_mean": 0.0017128790204878896, + "epoch": 0.018664333624963548, + "grad_norm": 0.12145639955997467, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0023459133226424456, + "clip_ratio/high_mean": 0.0009634899925003992, + "clip_ratio/low_mean": 0.0006493575965578202, + "clip_ratio/low_min": 1.1285662367299665e-05, + "clip_ratio/region_mean": 0.0016128476345329545, + "completions/clipped_ratio": 0.012276785714285698, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2327.0, + "completions/mean_length": 581.685302734375, + "completions/mean_terminated_length": 538.0045166015625, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.02099737532808399, + "grad_norm": 0.11929132789373398, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 1710461.0, + "reward": 0.5290178656578064, + "reward_std": 0.21602025628089905, + "rewards/verify_math_reward/mean": 0.5290178656578064, + "rewards/verify_math_reward/std": 0.49943605065345764, + "step": 9 + }, + { + "clip_ratio/high_max": 0.00253376059117727, + "clip_ratio/high_mean": 0.0010959464925690554, + "clip_ratio/low_mean": 0.0009177182091661962, + "clip_ratio/low_min": 6.172536632220726e-05, + "clip_ratio/region_mean": 0.0020136647290200926, + "epoch": 0.023330417031204434, + "grad_norm": 0.11741513758897781, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0021746221245848574, + "clip_ratio/high_mean": 0.0009437728404009249, + "clip_ratio/low_mean": 0.0006222142146725673, + "clip_ratio/low_min": 3.810313319263514e-05, + "clip_ratio/region_mean": 0.0015659870696254075, + "epoch": 0.025663458734324875, + "grad_norm": 0.12099823355674744, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 11 + }, + { + "clip_ratio/high_max": 0.002284212321683299, + "clip_ratio/high_mean": 0.0009622946818126366, + "clip_ratio/low_mean": 0.0006222454194357852, + "clip_ratio/low_min": 3.2773188650025986e-05, + "clip_ratio/region_mean": 0.0015845401139813475, + "epoch": 0.02799650043744532, + "grad_norm": 0.12343762069940567, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 12 + }, + { + "clip_ratio/high_max": 0.002133540387148969, + "clip_ratio/high_mean": 0.0008776721952017397, + "clip_ratio/low_mean": 0.0004960685146215837, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013737407025473658, + "completions/clipped_ratio": 0.014508928571428603, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 587.7221069335938, + "completions/mean_terminated_length": 536.0713500976562, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.03032954214056576, + "grad_norm": 0.1237315759062767, + "learning_rate": 1e-06, + "loss": -0.008, + "num_tokens": 2263244.0, + "reward": 0.6004464626312256, + "reward_std": 0.19208844006061554, + "rewards/verify_math_reward/mean": 0.6004464030265808, + "rewards/verify_math_reward/std": 0.49008017778396606, + "step": 13 + }, + { + "clip_ratio/high_max": 0.002390783491136972, + "clip_ratio/high_mean": 0.0009201841166941449, + "clip_ratio/low_mean": 0.0005904453855691827, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015106295177247375, + "epoch": 0.032662583843686206, + "grad_norm": 0.1241251677274704, + "learning_rate": 1e-06, + "loss": -0.0081, + "step": 14 + }, + { + "clip_ratio/high_max": 0.00265116490481887, + "clip_ratio/high_mean": 0.0010003927927755285, + "clip_ratio/low_mean": 0.0005856096286152024, + "clip_ratio/low_min": 1.5866971807554364e-05, + "clip_ratio/region_mean": 0.0015860024504945613, + "epoch": 0.03499562554680665, + "grad_norm": 0.1218414381146431, + "learning_rate": 1e-06, + "loss": -0.0081, + "step": 15 + }, + { + "clip_ratio/high_max": 0.002377994002017658, + "clip_ratio/high_mean": 0.0008695639698999003, + "clip_ratio/low_mean": 0.0005416954845713917, + "clip_ratio/low_min": 1.2747297660098411e-05, + "clip_ratio/region_mean": 0.0014112594762991648, + "epoch": 0.037328667249927096, + "grad_norm": 0.12389780580997467, + "learning_rate": 1e-06, + "loss": -0.0081, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0024106820128508843, + "clip_ratio/high_mean": 0.0009968766898964532, + "clip_ratio/low_mean": 0.0006063756100047613, + "clip_ratio/low_min": 1.5118529518076684e-05, + "clip_ratio/region_mean": 0.0016032523271860555, + "completions/clipped_ratio": 0.006696428571428603, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2943.0, + "completions/mean_length": 569.2154541015625, + "completions/mean_terminated_length": 545.4393310546875, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 0.039661708953047534, + "grad_norm": 0.12719471752643585, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 2832253.0, + "reward": 0.5446428656578064, + "reward_std": 0.2242865115404129, + "rewards/verify_math_reward/mean": 0.5446428656578064, + "rewards/verify_math_reward/std": 0.49828118085861206, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0025036680308403447, + "clip_ratio/high_mean": 0.0010271042483509518, + "clip_ratio/low_mean": 0.0006537355602631578, + "clip_ratio/low_min": 2.4149921955540776e-05, + "clip_ratio/region_mean": 0.0016808397995191626, + "epoch": 0.04199475065616798, + "grad_norm": 0.12676291167736053, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0027959811995970085, + "clip_ratio/high_mean": 0.0011196248633495998, + "clip_ratio/low_mean": 0.000717350643753889, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018369754980085418, + "epoch": 0.04432779235928842, + "grad_norm": 0.12739385664463043, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0029148588073439896, + "clip_ratio/high_mean": 0.001127249219280202, + "clip_ratio/low_mean": 0.0007854968207539059, + "clip_ratio/low_min": 6.304252019617707e-05, + "clip_ratio/region_mean": 0.0019127460473100655, + "epoch": 0.04666083406240887, + "grad_norm": 0.12382540851831436, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0019592746248235926, + "clip_ratio/high_mean": 0.0008812815285637043, + "clip_ratio/low_mean": 0.0007547873065050226, + "clip_ratio/low_min": 7.309357533813454e-05, + "clip_ratio/region_mean": 0.0016360688241547905, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3173.0, + "completions/mean_length": 622.6663208007812, + "completions/mean_terminated_length": 559.5147705078125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.048993875765529306, + "grad_norm": 0.13165442645549774, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 3415050.0, + "reward": 0.5245535969734192, + "reward_std": 0.24107471108436584, + "rewards/verify_math_reward/mean": 0.5245535969734192, + "rewards/verify_math_reward/std": 0.4996756613254547, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0024175399157684296, + "clip_ratio/high_mean": 0.0009496703860349953, + "clip_ratio/low_mean": 0.00075606488280755, + "clip_ratio/low_min": 5.7472104344924446e-05, + "clip_ratio/region_mean": 0.0017057352742995135, + "epoch": 0.05132691746864975, + "grad_norm": 0.13229161500930786, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0022421761314035393, + "clip_ratio/high_mean": 0.0009404981101397425, + "clip_ratio/low_mean": 0.0007850584315747255, + "clip_ratio/low_min": 6.583995582332136e-05, + "clip_ratio/region_mean": 0.0017255565471714363, + "epoch": 0.053659959171770195, + "grad_norm": 0.12714257836341858, + "learning_rate": 1e-06, + "loss": -0.0026, + "step": 23 + }, + { + "clip_ratio/high_max": 0.002484742122760508, + "clip_ratio/high_mean": 0.0010370668096584268, + "clip_ratio/low_mean": 0.0007963930911500938, + "clip_ratio/low_min": 6.353132994263433e-05, + "clip_ratio/region_mean": 0.0018334599080844782, + "epoch": 0.05599300087489064, + "grad_norm": 0.13083310425281525, + "learning_rate": 1e-06, + "loss": -0.0026, + "step": 24 + }, + { + "clip_ratio/high_max": 0.002489036211045459, + "clip_ratio/high_mean": 0.0010009248289861716, + "clip_ratio/low_mean": 0.0006285366598604014, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016294614688376896, + "completions/clipped_ratio": 0.012276785714285698, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 580.6439819335938, + "completions/mean_terminated_length": 536.9503173828125, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.058326042578011085, + "grad_norm": 0.12958918511867523, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 3988707.0, + "reward": 0.5892857313156128, + "reward_std": 0.20095311105251312, + "rewards/verify_math_reward/mean": 0.5892857313156128, + "rewards/verify_math_reward/std": 0.49223825335502625, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0024675652748555876, + "clip_ratio/high_mean": 0.0009995967520808335, + "clip_ratio/low_mean": 0.0006661116531176958, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001665708430664381, + "epoch": 0.06065908428113152, + "grad_norm": 0.12610486149787903, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 26 + }, + { + "clip_ratio/high_max": 0.002277686129673384, + "clip_ratio/high_mean": 0.0009735188032209408, + "clip_ratio/low_mean": 0.0006767354534531478, + "clip_ratio/low_min": 3.548448148649186e-05, + "clip_ratio/region_mean": 0.001650254249398131, + "epoch": 0.06299212598425197, + "grad_norm": 0.1267174482345581, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 27 + }, + { + "clip_ratio/high_max": 0.002439399620925542, + "clip_ratio/high_mean": 0.0010170852947339881, + "clip_ratio/low_mean": 0.0007562009886896703, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001773286254319828, + "epoch": 0.06532516768737241, + "grad_norm": 0.12486453354358673, + "learning_rate": 1e-06, + "loss": 0.0055, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0021370626272982918, + "clip_ratio/high_mean": 0.0008809797818685183, + "clip_ratio/low_mean": 0.0005185944191907765, + "clip_ratio/low_min": 3.953835221182089e-05, + "clip_ratio/region_mean": 0.0013995742046972737, + "completions/clipped_ratio": 0.021205357142857095, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3560.0, + "completions/mean_length": 678.6060791015625, + "completions/mean_terminated_length": 604.5689697265625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.06765820939049286, + "grad_norm": 0.11682160198688507, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 4593306.0, + "reward": 0.5457589626312256, + "reward_std": 0.21774594485759735, + "rewards/verify_math_reward/mean": 0.5457589030265808, + "rewards/verify_math_reward/std": 0.4981797933578491, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0021843241120222956, + "clip_ratio/high_mean": 0.0009300625351897907, + "clip_ratio/low_mean": 0.0006210761221154826, + "clip_ratio/low_min": 4.4221876123629045e-05, + "clip_ratio/region_mean": 0.0015511386664002202, + "epoch": 0.0699912510936133, + "grad_norm": 0.1146954670548439, + "learning_rate": 1e-06, + "loss": 0.0045, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0024472579316352494, + "clip_ratio/high_mean": 0.0010522642951400485, + "clip_ratio/low_mean": 0.0005727457664761459, + "clip_ratio/low_min": 3.6030963201483246e-05, + "clip_ratio/region_mean": 0.001625010023417417, + "epoch": 0.07232429279673375, + "grad_norm": 0.1153184175491333, + "learning_rate": 1e-06, + "loss": 0.0044, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0019284382869955152, + "clip_ratio/high_mean": 0.000992046516330447, + "clip_ratio/low_mean": 0.0006042511531632044, + "clip_ratio/low_min": 2.522551039874088e-05, + "clip_ratio/region_mean": 0.0015962977122399025, + "epoch": 0.07465733449985419, + "grad_norm": 0.1235361248254776, + "learning_rate": 1e-06, + "loss": 0.0044, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0021176241425564513, + "clip_ratio/high_mean": 0.0009372366621391848, + "clip_ratio/low_mean": 0.0005595719358097995, + "clip_ratio/low_min": 6.470445987361018e-05, + "clip_ratio/region_mean": 0.0014968086179578677, + "completions/clipped_ratio": 0.030133928571428603, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4067.0, + "completions/mean_length": 700.318115234375, + "completions/mean_terminated_length": 594.8135986328125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.07699037620297462, + "grad_norm": 0.12378786504268646, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 5199175.0, + "reward": 0.5412946939468384, + "reward_std": 0.21545571088790894, + "rewards/verify_math_reward/mean": 0.5412946343421936, + "rewards/verify_math_reward/std": 0.49857014417648315, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0022196359786903486, + "clip_ratio/high_mean": 0.0009039962114911759, + "clip_ratio/low_mean": 0.000660695368424058, + "clip_ratio/low_min": 3.265412669861689e-05, + "clip_ratio/region_mean": 0.0015646915635443293, + "epoch": 0.07932341790609507, + "grad_norm": 0.125546395778656, + "learning_rate": 1e-06, + "loss": -0.0103, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0023568391989101656, + "clip_ratio/high_mean": 0.0009934375430020737, + "clip_ratio/low_mean": 0.0006520127171825152, + "clip_ratio/low_min": 4.448789968591882e-05, + "clip_ratio/region_mean": 0.0016454502547276206, + "epoch": 0.08165645960921551, + "grad_norm": 0.11966123431921005, + "learning_rate": 1e-06, + "loss": -0.0104, + "step": 35 + }, + { + "clip_ratio/high_max": 0.002292566980031552, + "clip_ratio/high_mean": 0.0009935368743754225, + "clip_ratio/low_mean": 0.0006416244887077482, + "clip_ratio/low_min": 7.534917676821351e-05, + "clip_ratio/region_mean": 0.0016351613521692343, + "epoch": 0.08398950131233596, + "grad_norm": 0.12310697883367538, + "learning_rate": 1e-06, + "loss": -0.0103, + "step": 36 + }, + { + "clip_ratio/high_max": 0.00229012560157571, + "clip_ratio/high_mean": 0.0009375477602588944, + "clip_ratio/low_mean": 0.0007024908227322157, + "clip_ratio/low_min": 2.9890166842960753e-05, + "clip_ratio/region_mean": 0.0016400385648012161, + "completions/clipped_ratio": 0.014508928571428603, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2962.0, + "completions/mean_length": 629.1998291015625, + "completions/mean_terminated_length": 578.15966796875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.0863225430154564, + "grad_norm": 0.12611764669418335, + "learning_rate": 1e-06, + "loss": 0.0144, + "num_tokens": 5804930.0, + "reward": 0.5290178656578064, + "reward_std": 0.23063203692436218, + "rewards/verify_math_reward/mean": 0.5290178656578064, + "rewards/verify_math_reward/std": 0.49943605065345764, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0026505103378440253, + "clip_ratio/high_mean": 0.000995517555566039, + "clip_ratio/low_mean": 0.000749046572309453, + "clip_ratio/low_min": 5.4068157623987645e-05, + "clip_ratio/region_mean": 0.0017445641424274072, + "epoch": 0.08865558471857685, + "grad_norm": 0.12353263795375824, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 38 + }, + { + "clip_ratio/high_max": 0.002375702024437487, + "clip_ratio/high_mean": 0.0009863118175417185, + "clip_ratio/low_mean": 0.0008022430338314734, + "clip_ratio/low_min": 3.6729577914229594e-05, + "clip_ratio/region_mean": 0.0017885548804770224, + "epoch": 0.09098862642169729, + "grad_norm": 0.12338662147521973, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0024909446510719135, + "clip_ratio/high_mean": 0.0010256626637783484, + "clip_ratio/low_mean": 0.000834449481772026, + "clip_ratio/low_min": 6.956895595067181e-05, + "clip_ratio/region_mean": 0.001860112141002901, + "epoch": 0.09332166812481774, + "grad_norm": 0.12385766208171844, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 40 + }, + { + "clip_ratio/high_max": 0.001583784669492161, + "clip_ratio/high_mean": 0.0005768877708760556, + "clip_ratio/low_mean": 0.0005286252780933864, + "clip_ratio/low_min": 1.4124293556960765e-05, + "clip_ratio/region_mean": 0.0011055130635213573, + "completions/clipped_ratio": 0.017857142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2063.0, + "completions/mean_length": 667.3471069335938, + "completions/mean_terminated_length": 605.0079345703125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.09565470982793818, + "grad_norm": 0.1104072779417038, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 6421681.0, + "reward": 0.5066964626312256, + "reward_std": 0.1720547378063202, + "rewards/verify_math_reward/mean": 0.5066964030265808, + "rewards/verify_math_reward/std": 0.5002344250679016, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0020037883587065153, + "clip_ratio/high_mean": 0.0006965057691559196, + "clip_ratio/low_mean": 0.0006347013313643401, + "clip_ratio/low_min": 2.8372424822009634e-05, + "clip_ratio/region_mean": 0.001331207105977228, + "epoch": 0.09798775153105861, + "grad_norm": 0.11040699481964111, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0018374437386228237, + "clip_ratio/high_mean": 0.0006500711151602445, + "clip_ratio/low_mean": 0.0005802347404824104, + "clip_ratio/low_min": 1.0425354048493318e-05, + "clip_ratio/region_mean": 0.001230305842909729, + "epoch": 0.10032079323417906, + "grad_norm": 0.10948064178228378, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0018857809591281693, + "clip_ratio/high_mean": 0.0006436912299250253, + "clip_ratio/low_mean": 0.000669299080982455, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013129903072695015, + "epoch": 0.1026538349372995, + "grad_norm": 0.1097576767206192, + "learning_rate": 1e-06, + "loss": 0.0183, + "step": 44 + }, + { + "clip_ratio/high_max": 0.001930827418618719, + "clip_ratio/high_mean": 0.0006981309613820486, + "clip_ratio/low_mean": 0.0005967152856101166, + "clip_ratio/low_min": 2.5625256967032328e-05, + "clip_ratio/region_mean": 0.001294846246310044, + "completions/clipped_ratio": 0.030133928571428603, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3901.0, + "completions/mean_length": 659.1183471679688, + "completions/mean_terminated_length": 552.333740234375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.10498687664041995, + "grad_norm": 0.11341482400894165, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 7004139.0, + "reward": 0.5401785969734192, + "reward_std": 0.15878842771053314, + "rewards/verify_math_reward/mean": 0.5401785969734192, + "rewards/verify_math_reward/std": 0.49866142868995667, + "step": 45 + }, + { + "clip_ratio/high_max": 0.002184344320994569, + "clip_ratio/high_mean": 0.000801586273155408, + "clip_ratio/low_mean": 0.0006106818127591396, + "clip_ratio/low_min": 1.4501159967039712e-05, + "clip_ratio/region_mean": 0.0014122680768196005, + "epoch": 0.10731991834354039, + "grad_norm": 0.1139429360628128, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0022679470203001983, + "clip_ratio/high_mean": 0.000745011353501468, + "clip_ratio/low_mean": 0.0006523218762595206, + "clip_ratio/low_min": 1.2773350135830697e-05, + "clip_ratio/region_mean": 0.0013973332061141264, + "epoch": 0.10965296004666084, + "grad_norm": 0.11152958869934082, + "learning_rate": 1e-06, + "loss": 0.002, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0024709537283342797, + "clip_ratio/high_mean": 0.0008462621262879111, + "clip_ratio/low_mean": 0.0006355218320095446, + "clip_ratio/low_min": 3.843788363155909e-05, + "clip_ratio/region_mean": 0.001481783951021498, + "epoch": 0.11198600174978128, + "grad_norm": 0.11159171164035797, + "learning_rate": 1e-06, + "loss": 0.002, + "step": 48 + }, + { + "clip_ratio/high_max": 0.002606435209600022, + "clip_ratio/high_mean": 0.0010781428645714186, + "clip_ratio/low_mean": 0.000551244026837594, + "clip_ratio/low_min": 8.372309730475536e-05, + "clip_ratio/region_mean": 0.0016293868975481018, + "completions/clipped_ratio": 0.016741071428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3906.0, + "completions/mean_length": 655.7846069335938, + "completions/mean_terminated_length": 597.211181640625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.11431904345290173, + "grad_norm": 0.13207431137561798, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 7618274.0, + "reward": 0.5558035969734192, + "reward_std": 0.22808194160461426, + "rewards/verify_math_reward/mean": 0.5558035969734192, + "rewards/verify_math_reward/std": 0.49715369939804077, + "step": 49 + }, + { + "clip_ratio/high_max": 0.002573193338321289, + "clip_ratio/high_mean": 0.0010621982528391527, + "clip_ratio/low_mean": 0.000621154951659264, + "clip_ratio/low_min": 4.7855231059656944e-05, + "clip_ratio/region_mean": 0.0016833531844895333, + "epoch": 0.11665208515602217, + "grad_norm": 0.13091568648815155, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 50 + }, + { + "clip_ratio/high_max": 0.002612927622976713, + "clip_ratio/high_mean": 0.0010625724717101548, + "clip_ratio/low_mean": 0.0007023778762231814, + "clip_ratio/low_min": 5.2263617362768855e-05, + "clip_ratio/region_mean": 0.0017649503679422196, + "epoch": 0.1189851268591426, + "grad_norm": 0.13183654844760895, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0026154524603043683, + "clip_ratio/high_mean": 0.001107633943320252, + "clip_ratio/low_mean": 0.00068232952071412, + "clip_ratio/low_min": 3.851271958410507e-05, + "clip_ratio/region_mean": 0.0017899634622153826, + "epoch": 0.12131816856226305, + "grad_norm": 0.12829582393169403, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 52 + }, + { + "clip_ratio/high_max": 0.002519420573662501, + "clip_ratio/high_mean": 0.0010384653105575126, + "clip_ratio/low_mean": 0.0005523251293197973, + "clip_ratio/low_min": 3.0062635232752655e-05, + "clip_ratio/region_mean": 0.0015907904598861933, + "completions/clipped_ratio": 0.016741071428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3275.0, + "completions/mean_length": 595.1596069335938, + "completions/mean_terminated_length": 535.553955078125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.12365121026538349, + "grad_norm": 0.12745356559753418, + "learning_rate": 1e-06, + "loss": -0.0042, + "num_tokens": 8186209.0, + "reward": 0.5691964626312256, + "reward_std": 0.22529542446136475, + "rewards/verify_math_reward/mean": 0.5691964030265808, + "rewards/verify_math_reward/std": 0.4954652488231659, + "step": 53 + }, + { + "clip_ratio/high_max": 0.002658939854882192, + "clip_ratio/high_mean": 0.00113034357491415, + "clip_ratio/low_mean": 0.0006279497829382308, + "clip_ratio/low_min": 6.613486675632885e-05, + "clip_ratio/region_mean": 0.0017582933578523807, + "epoch": 0.12598425196850394, + "grad_norm": 0.12348710000514984, + "learning_rate": 1e-06, + "loss": -0.0042, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0025012689584400505, + "clip_ratio/high_mean": 0.001088208387955092, + "clip_ratio/low_mean": 0.0006285139834290021, + "clip_ratio/low_min": 3.1737179597257636e-05, + "clip_ratio/region_mean": 0.0017167224104923662, + "epoch": 0.1283172936716244, + "grad_norm": 0.12301290035247803, + "learning_rate": 1e-06, + "loss": -0.0043, + "step": 55 + }, + { + "clip_ratio/high_max": 0.002401656427537091, + "clip_ratio/high_mean": 0.001099151726521086, + "clip_ratio/low_mean": 0.0007453204689227277, + "clip_ratio/low_min": 5.498792961589061e-05, + "clip_ratio/region_mean": 0.00184447223728057, + "epoch": 0.13065033537474482, + "grad_norm": 0.12256094813346863, + "learning_rate": 1e-06, + "loss": -0.0044, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0018831756169674918, + "clip_ratio/high_mean": 0.0007435029820044292, + "clip_ratio/low_mean": 0.0005309441312419949, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012744471132464241, + "completions/clipped_ratio": 0.011160714285714302, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3656.0, + "completions/mean_length": 642.0803833007812, + "completions/mean_terminated_length": 603.0971069335938, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.13298337707786526, + "grad_norm": 0.1111573651432991, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 8814017.0, + "reward": 0.5848214626312256, + "reward_std": 0.17400752007961273, + "rewards/verify_math_reward/mean": 0.5848214030265808, + "rewards/verify_math_reward/std": 0.49302801489830017, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0020343345895526, + "clip_ratio/high_mean": 0.0007646062240382889, + "clip_ratio/low_mean": 0.0006124447772890562, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013770509904134087, + "epoch": 0.13531641878098571, + "grad_norm": 0.1069447249174118, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 58 + }, + { + "clip_ratio/high_max": 0.001814778457628563, + "clip_ratio/high_mean": 0.0007152991438488243, + "clip_ratio/low_mean": 0.000650829207188508, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013661283046531025, + "epoch": 0.13764946048410615, + "grad_norm": 0.10686612129211426, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 59 + }, + { + "clip_ratio/high_max": 0.002180582654546015, + "clip_ratio/high_mean": 0.0008631766177131794, + "clip_ratio/low_mean": 0.0006756882612535264, + "clip_ratio/low_min": 1.304801662627142e-05, + "clip_ratio/region_mean": 0.001538864893518621, + "epoch": 0.1399825021872266, + "grad_norm": 0.10535623878240585, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 60 + }, + { + "clip_ratio/high_max": 0.002108367465552874, + "clip_ratio/high_mean": 0.0007626612023159396, + "clip_ratio/low_mean": 0.0004933809123031097, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012560421346279327, + "completions/clipped_ratio": 0.029017857142857095, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3878.0, + "completions/mean_length": 694.1864013671875, + "completions/mean_terminated_length": 592.5230102539062, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.14231554389034703, + "grad_norm": 0.11304501444101334, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 9420976.0, + "reward": 0.5569196939468384, + "reward_std": 0.1749839335680008, + "rewards/verify_math_reward/mean": 0.5569196343421936, + "rewards/verify_math_reward/std": 0.49702703952789307, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0021938198697171174, + "clip_ratio/high_mean": 0.0008237038055085577, + "clip_ratio/low_mean": 0.000559093690753798, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013827974908053875, + "epoch": 0.1446485855934675, + "grad_norm": 0.10986457765102386, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 62 + }, + { + "clip_ratio/high_max": 0.002056484743661713, + "clip_ratio/high_mean": 0.0008661098709126236, + "clip_ratio/low_mean": 0.0005379781496230862, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014040879941603635, + "epoch": 0.14698162729658792, + "grad_norm": 0.11047877371311188, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 63 + }, + { + "clip_ratio/high_max": 0.002259606761072064, + "clip_ratio/high_mean": 0.0008478759955323767, + "clip_ratio/low_mean": 0.0005690079124178737, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014168839370540809, + "epoch": 0.14931466899970838, + "grad_norm": 0.10816746205091476, + "learning_rate": 1e-06, + "loss": 0.0072, + "step": 64 + }, + { + "clip_ratio/high_max": 0.002090176487399731, + "clip_ratio/high_mean": 0.0008006530915736221, + "clip_ratio/low_mean": 0.000709360796463443, + "clip_ratio/low_min": 4.4256788896746e-05, + "clip_ratio/region_mean": 0.001510013888037065, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2538.0, + "completions/mean_length": 605.8460083007812, + "completions/mean_terminated_length": 578.364501953125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.15164771070282881, + "grad_norm": 0.12577134370803833, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 10020766.0, + "reward": 0.5837053656578064, + "reward_std": 0.21857714653015137, + "rewards/verify_math_reward/mean": 0.5837053656578064, + "rewards/verify_math_reward/std": 0.49321895837783813, + "step": 65 + }, + { + "clip_ratio/high_max": 0.002333878477656981, + "clip_ratio/high_mean": 0.0009288478941016365, + "clip_ratio/low_mean": 0.0007335979662457248, + "clip_ratio/low_min": 2.1956602722639218e-05, + "clip_ratio/region_mean": 0.0016624458621663507, + "epoch": 0.15398075240594924, + "grad_norm": 0.12191906571388245, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 66 + }, + { + "clip_ratio/high_max": 0.002172569580579875, + "clip_ratio/high_mean": 0.0008620889511803398, + "clip_ratio/low_mean": 0.0008169914053723915, + "clip_ratio/low_min": 3.694098995765671e-05, + "clip_ratio/region_mean": 0.00167908036019071, + "epoch": 0.1563137941090697, + "grad_norm": 0.12169674783945084, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0023263591720024124, + "clip_ratio/high_mean": 0.0009342802095488878, + "clip_ratio/low_mean": 0.0009031826884893235, + "clip_ratio/low_min": 5.15037982040667e-05, + "clip_ratio/region_mean": 0.0018374628780293278, + "epoch": 0.15864683581219013, + "grad_norm": 0.12440544366836548, + "learning_rate": 1e-06, + "loss": 0.002, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0018946796481031924, + "clip_ratio/high_mean": 0.0008617473213234916, + "clip_ratio/low_mean": 0.0006712405856887926, + "clip_ratio/low_min": 2.0611231775546912e-05, + "clip_ratio/region_mean": 0.0015329879242926836, + "completions/clipped_ratio": 0.029017857142857095, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3917.0, + "completions/mean_length": 706.7098388671875, + "completions/mean_terminated_length": 605.4207153320312, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.1609798775153106, + "grad_norm": 0.13245128095149994, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 10628714.0, + "reward": 0.559151828289032, + "reward_std": 0.23240120708942413, + "rewards/verify_math_reward/mean": 0.5591517686843872, + "rewards/verify_math_reward/std": 0.496766060590744, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0024722638409002684, + "clip_ratio/high_mean": 0.0009684698052296881, + "clip_ratio/low_mean": 0.0007125175779947313, + "clip_ratio/low_min": 3.960115100198891e-05, + "clip_ratio/region_mean": 0.0016809873850434087, + "epoch": 0.16331291921843102, + "grad_norm": 0.13081303238868713, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0022747882321709767, + "clip_ratio/high_mean": 0.0009260399037884781, + "clip_ratio/low_mean": 0.000759382968681166, + "clip_ratio/low_min": 4.311801512812963e-05, + "clip_ratio/region_mean": 0.0016854228488227818, + "epoch": 0.16564596092155148, + "grad_norm": 0.1290624439716339, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0023498402879340574, + "clip_ratio/high_mean": 0.0009991134465963114, + "clip_ratio/low_mean": 0.0007842375252948841, + "clip_ratio/low_min": 4.8880713620746974e-05, + "clip_ratio/region_mean": 0.0017833509882621001, + "epoch": 0.1679790026246719, + "grad_norm": 0.1262524425983429, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0027180942706763744, + "clip_ratio/high_mean": 0.0011511004449857865, + "clip_ratio/low_mean": 0.0006964103449718095, + "clip_ratio/low_min": 3.421556357352529e-05, + "clip_ratio/region_mean": 0.0018475107644917443, + "completions/clipped_ratio": 0.029017857142857095, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3315.0, + "completions/mean_length": 680.4319458007812, + "completions/mean_terminated_length": 578.3574829101562, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.17031204432779237, + "grad_norm": 0.13039037585258484, + "learning_rate": 1e-06, + "loss": -0.0199, + "num_tokens": 11226341.0, + "reward": 0.660714328289032, + "reward_std": 0.23322537541389465, + "rewards/verify_math_reward/mean": 0.6607142686843872, + "rewards/verify_math_reward/std": 0.4737313687801361, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0024872794601833448, + "clip_ratio/high_mean": 0.0011051116780436132, + "clip_ratio/low_mean": 0.0007465089693141636, + "clip_ratio/low_min": 6.628324354096549e-05, + "clip_ratio/region_mean": 0.0018516206691856496, + "epoch": 0.1726450860309128, + "grad_norm": 0.12996259331703186, + "learning_rate": 1e-06, + "loss": -0.0199, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0028027194930473343, + "clip_ratio/high_mean": 0.0012348731033853255, + "clip_ratio/low_mean": 0.0008350134203283233, + "clip_ratio/low_min": 6.279769513639621e-05, + "clip_ratio/region_mean": 0.0020698865337180905, + "epoch": 0.17497812773403323, + "grad_norm": 0.1261185258626938, + "learning_rate": 1e-06, + "loss": -0.0201, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0032098262090585195, + "clip_ratio/high_mean": 0.0013133431639289483, + "clip_ratio/low_mean": 0.0007700925461904262, + "clip_ratio/low_min": 4.5335311369854026e-05, + "clip_ratio/region_mean": 0.002083435691019986, + "epoch": 0.1773111694371537, + "grad_norm": 0.1239931732416153, + "learning_rate": 1e-06, + "loss": -0.0201, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0019031432893825695, + "clip_ratio/high_mean": 0.0009410108505107928, + "clip_ratio/low_mean": 0.0006787779257138027, + "clip_ratio/low_min": 5.3184265198069625e-05, + "clip_ratio/region_mean": 0.0016197887598536909, + "completions/clipped_ratio": 0.008928571428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2877.0, + "completions/mean_length": 593.59375, + "completions/mean_terminated_length": 562.04052734375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.17964421114027412, + "grad_norm": 0.1269347369670868, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 11816601.0, + "reward": 0.6495535969734192, + "reward_std": 0.21132300794124603, + "rewards/verify_math_reward/mean": 0.6495535969734192, + "rewards/verify_math_reward/std": 0.477376252412796, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0020921051764162257, + "clip_ratio/high_mean": 0.0009143478564510588, + "clip_ratio/low_mean": 0.0007781102540320717, + "clip_ratio/low_min": 4.562021877063671e-05, + "clip_ratio/region_mean": 0.0016924581068451516, + "epoch": 0.18197725284339458, + "grad_norm": 0.1262543946504593, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 78 + }, + { + "clip_ratio/high_max": 0.00232102838344872, + "clip_ratio/high_mean": 0.0010060116510430817, + "clip_ratio/low_mean": 0.0008259733658633195, + "clip_ratio/low_min": 9.44464636631892e-05, + "clip_ratio/region_mean": 0.00183198502054438, + "epoch": 0.184310294546515, + "grad_norm": 0.12525974214076996, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 79 + }, + { + "clip_ratio/high_max": 0.002366886088566389, + "clip_ratio/high_mean": 0.0010620286157063674, + "clip_ratio/low_mean": 0.0008741377005208051, + "clip_ratio/low_min": 7.721427664364455e-05, + "clip_ratio/region_mean": 0.0019361662562005222, + "epoch": 0.18664333624963547, + "grad_norm": 0.1252414882183075, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0018435443125781603, + "clip_ratio/high_mean": 0.0007656542757104035, + "clip_ratio/low_mean": 0.0005714547769457567, + "clip_ratio/low_min": 1.332338524662191e-05, + "clip_ratio/region_mean": 0.0013371090353757609, + "completions/clipped_ratio": 0.022321428571428603, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3419.0, + "completions/mean_length": 646.0457763671875, + "completions/mean_terminated_length": 567.2796630859375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.1889763779527559, + "grad_norm": 0.1250150203704834, + "learning_rate": 1e-06, + "loss": 0.0102, + "num_tokens": 12395730.0, + "reward": 0.598214328289032, + "reward_std": 0.18430186808109283, + "rewards/verify_math_reward/mean": 0.5982142686843872, + "rewards/verify_math_reward/std": 0.49053287506103516, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0019822753747575916, + "clip_ratio/high_mean": 0.0008148421766236424, + "clip_ratio/low_mean": 0.0005596559967671055, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013744982024945784, + "epoch": 0.19130941965587636, + "grad_norm": 0.12403535842895508, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 82 + }, + { + "clip_ratio/high_max": 0.001799621146346908, + "clip_ratio/high_mean": 0.0008243617266998626, + "clip_ratio/low_mean": 0.0006454474469137494, + "clip_ratio/low_min": 5.0011256462312303e-05, + "clip_ratio/region_mean": 0.001469809198169969, + "epoch": 0.1936424613589968, + "grad_norm": 0.12789423763751984, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0021637038080370985, + "clip_ratio/high_mean": 0.0009658185390435392, + "clip_ratio/low_mean": 0.000647538096018252, + "clip_ratio/low_min": 2.5005628231156152e-05, + "clip_ratio/region_mean": 0.0016133566678036004, + "epoch": 0.19597550306211722, + "grad_norm": 0.11844993382692337, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0016195920543395914, + "clip_ratio/high_mean": 0.0007305502513190731, + "clip_ratio/low_mean": 0.00042413621758896625, + "clip_ratio/low_min": 2.5902445486281067e-05, + "clip_ratio/region_mean": 0.0011546864770934917, + "completions/clipped_ratio": 0.0189732142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3058.0, + "completions/mean_length": 694.1875610351562, + "completions/mean_terminated_length": 628.3958740234375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.19830854476523768, + "grad_norm": 0.10732998698949814, + "learning_rate": 1e-06, + "loss": 0.0246, + "num_tokens": 13043522.0, + "reward": 0.5290178656578064, + "reward_std": 0.1879132241010666, + "rewards/verify_math_reward/mean": 0.5290178656578064, + "rewards/verify_math_reward/std": 0.49943605065345764, + "step": 85 + }, + { + "clip_ratio/high_max": 0.001977817406441318, + "clip_ratio/high_mean": 0.0008252691604866413, + "clip_ratio/low_mean": 0.0005512036532309139, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013764728064415976, + "epoch": 0.2006415864683581, + "grad_norm": 0.10541458427906036, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0018707476629060693, + "clip_ratio/high_mean": 0.0008347092680196511, + "clip_ratio/low_mean": 0.0005563870163314277, + "clip_ratio/low_min": 2.986857907671947e-05, + "clip_ratio/region_mean": 0.001391096335282782, + "epoch": 0.20297462817147857, + "grad_norm": 0.10650135576725006, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0019968881679233164, + "clip_ratio/high_mean": 0.000803314738732297, + "clip_ratio/low_mean": 0.0005893544366699643, + "clip_ratio/low_min": 1.4934289538359735e-05, + "clip_ratio/region_mean": 0.0013926691899541765, + "epoch": 0.205307669874599, + "grad_norm": 0.10585729777812958, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0017711450818751473, + "clip_ratio/high_mean": 0.0007248113288369495, + "clip_ratio/low_mean": 0.0005172562141524395, + "clip_ratio/low_min": 1.4178765923134051e-05, + "clip_ratio/region_mean": 0.0012420675193425268, + "completions/clipped_ratio": 0.030133928571428603, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3155.0, + "completions/mean_length": 707.794677734375, + "completions/mean_terminated_length": 602.5224609375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.20764071157771946, + "grad_norm": 0.12869536876678467, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 13651450.0, + "reward": 0.637276828289032, + "reward_std": 0.18129737675189972, + "rewards/verify_math_reward/mean": 0.6372767686843872, + "rewards/verify_math_reward/std": 0.481054425239563, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0019154321453243028, + "clip_ratio/high_mean": 0.0007747264771751361, + "clip_ratio/low_mean": 0.0005645960791298421, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001339322541753063, + "epoch": 0.2099737532808399, + "grad_norm": 0.12464048713445663, + "learning_rate": 1e-06, + "loss": -0.0065, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0019451008993200958, + "clip_ratio/high_mean": 0.0008710344573046314, + "clip_ratio/low_mean": 0.0006486630554718431, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015196975255094003, + "epoch": 0.21230679498396035, + "grad_norm": 0.12116258591413498, + "learning_rate": 1e-06, + "loss": -0.0066, + "step": 91 + }, + { + "clip_ratio/high_max": 0.002008242765441537, + "clip_ratio/high_mean": 0.0008125443891913164, + "clip_ratio/low_mean": 0.0007152098842198029, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001527754280687077, + "epoch": 0.21463983668708078, + "grad_norm": 0.11910553276538849, + "learning_rate": 1e-06, + "loss": -0.0067, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0018941184753202833, + "clip_ratio/high_mean": 0.0007535947170254076, + "clip_ratio/low_mean": 0.0004500252680372796, + "clip_ratio/low_min": 1.5907355191302486e-05, + "clip_ratio/region_mean": 0.001203619995067129, + "completions/clipped_ratio": 0.022321428571428603, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2767.0, + "completions/mean_length": 705.9219360351562, + "completions/mean_terminated_length": 628.5228271484375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.2169728783902012, + "grad_norm": 0.1125323548913002, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 14286332.0, + "reward": 0.5535714626312256, + "reward_std": 0.19163475930690765, + "rewards/verify_math_reward/mean": 0.5535714030265808, + "rewards/verify_math_reward/std": 0.4973994791507721, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0021611167867376935, + "clip_ratio/high_mean": 0.0007815511544322362, + "clip_ratio/low_mean": 0.0005014456273784162, + "clip_ratio/low_min": 1.736834747134708e-05, + "clip_ratio/region_mean": 0.0012829967672587372, + "epoch": 0.21930592009332167, + "grad_norm": 0.11015673726797104, + "learning_rate": 1e-06, + "loss": -0.0053, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0020602601434802637, + "clip_ratio/high_mean": 0.0008277097422251245, + "clip_ratio/low_mean": 0.0005118875869811745, + "clip_ratio/low_min": 2.3599455744260922e-05, + "clip_ratio/region_mean": 0.0013395973219303414, + "epoch": 0.2216389617964421, + "grad_norm": 0.11136667430400848, + "learning_rate": 1e-06, + "loss": -0.0054, + "step": 95 + }, + { + "clip_ratio/high_max": 0.002315436089702416, + "clip_ratio/high_mean": 0.0008609901287854882, + "clip_ratio/low_mean": 0.0005434525282907998, + "clip_ratio/low_min": 1.5907355191302486e-05, + "clip_ratio/region_mean": 0.0014044426607142668, + "epoch": 0.22397200349956256, + "grad_norm": 0.10752148181200027, + "learning_rate": 1e-06, + "loss": -0.0054, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0016625657699478325, + "clip_ratio/high_mean": 0.0006201890655574971, + "clip_ratio/low_mean": 0.000499386642331956, + "clip_ratio/low_min": 1.2033115126541816e-05, + "clip_ratio/region_mean": 0.0011195757197128842, + "completions/clipped_ratio": 0.0424107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2853.0, + "completions/mean_length": 797.1551513671875, + "completions/mean_terminated_length": 651.052490234375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.226305045202683, + "grad_norm": 0.10692703723907471, + "learning_rate": 1e-06, + "loss": -0.0092, + "num_tokens": 14930167.0, + "reward": 0.5457589626312256, + "reward_std": 0.17107722163200378, + "rewards/verify_math_reward/mean": 0.5457589030265808, + "rewards/verify_math_reward/std": 0.4981797933578491, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0018763612533803098, + "clip_ratio/high_mean": 0.0007112914427125361, + "clip_ratio/low_mean": 0.0005139210170455044, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00122521246157703, + "epoch": 0.22863808690580345, + "grad_norm": 0.10555337369441986, + "learning_rate": 1e-06, + "loss": -0.0092, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0016491648311784957, + "clip_ratio/high_mean": 0.0006668064925179351, + "clip_ratio/low_mean": 0.0006113593735790346, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001278165840631118, + "epoch": 0.23097112860892388, + "grad_norm": 0.1043100655078888, + "learning_rate": 1e-06, + "loss": -0.0093, + "step": 99 + }, + { + "clip_ratio/high_max": 0.00191963902398129, + "clip_ratio/high_mean": 0.0007225982008094434, + "clip_ratio/low_mean": 0.0005822008433824521, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001304799039644422, + "epoch": 0.23330417031204434, + "grad_norm": 0.10567686706781387, + "learning_rate": 1e-06, + "loss": -0.0093, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0016583305387030123, + "clip_ratio/high_mean": 0.0007518018992414, + "clip_ratio/low_mean": 0.0007701156791881658, + "clip_ratio/low_min": 5.871514713362558e-05, + "clip_ratio/region_mean": 0.0015219175402307883, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3472.0, + "completions/mean_length": 699.763427734375, + "completions/mean_terminated_length": 606.2889404296875, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.23563721201516477, + "grad_norm": 0.13025298714637756, + "learning_rate": 1e-06, + "loss": 0.0202, + "num_tokens": 15546323.0, + "reward": 0.5245535969734192, + "reward_std": 0.21591150760650635, + "rewards/verify_math_reward/mean": 0.5245535969734192, + "rewards/verify_math_reward/std": 0.4996756911277771, + "step": 101 + }, + { + "clip_ratio/high_max": 0.002065653068711981, + "clip_ratio/high_mean": 0.0008601816971349763, + "clip_ratio/low_mean": 0.0007794047523930203, + "clip_ratio/low_min": 7.620711767231114e-05, + "clip_ratio/region_mean": 0.0016395864731748588, + "epoch": 0.2379702537182852, + "grad_norm": 0.12589409947395325, + "learning_rate": 1e-06, + "loss": 0.0201, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0020022314784000628, + "clip_ratio/high_mean": 0.0008255693137471098, + "clip_ratio/low_mean": 0.0007917809125501662, + "clip_ratio/low_min": 7.215025198092917e-05, + "clip_ratio/region_mean": 0.0016173501862795092, + "epoch": 0.24030329542140566, + "grad_norm": 0.1242484375834465, + "learning_rate": 1e-06, + "loss": 0.02, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0019035753612115514, + "clip_ratio/high_mean": 0.0009137180004472611, + "clip_ratio/low_mean": 0.0008719772231415845, + "clip_ratio/low_min": 0.00010575910619081696, + "clip_ratio/region_mean": 0.0017856952326837927, + "epoch": 0.2426363371245261, + "grad_norm": 0.1210545226931572, + "learning_rate": 1e-06, + "loss": 0.0199, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0022454316713265143, + "clip_ratio/high_mean": 0.000870076099090511, + "clip_ratio/low_mean": 0.0005405405645433348, + "clip_ratio/low_min": 1.1241007086937316e-05, + "clip_ratio/region_mean": 0.0014106166454439517, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3474.0, + "completions/mean_length": 702.9375610351562, + "completions/mean_terminated_length": 593.48388671875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.24496937882764655, + "grad_norm": 0.13943475484848022, + "learning_rate": 1e-06, + "loss": -0.0111, + "num_tokens": 16149587.0, + "reward": 0.582589328289032, + "reward_std": 0.20993182063102722, + "rewards/verify_math_reward/mean": 0.5825892686843872, + "rewards/verify_math_reward/std": 0.4934072494506836, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0022179896768648177, + "clip_ratio/high_mean": 0.0009223711076629115, + "clip_ratio/low_mean": 0.0006832732597104041, + "clip_ratio/low_min": 7.951450606924482e-05, + "clip_ratio/region_mean": 0.0016056443710112944, + "epoch": 0.24730242053076698, + "grad_norm": 0.1324169635772705, + "learning_rate": 1e-06, + "loss": -0.0113, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0023009943761280738, + "clip_ratio/high_mean": 0.00090966447169194, + "clip_ratio/low_mean": 0.0006601435015909374, + "clip_ratio/low_min": 4.010021802969277e-05, + "clip_ratio/region_mean": 0.001569807980558835, + "epoch": 0.24963546223388744, + "grad_norm": 0.13063472509384155, + "learning_rate": 1e-06, + "loss": -0.0114, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0026666615303838626, + "clip_ratio/high_mean": 0.0010358955296396744, + "clip_ratio/low_mean": 0.0007489463259844342, + "clip_ratio/low_min": 3.096954242209904e-05, + "clip_ratio/region_mean": 0.0017848418283392675, + "epoch": 0.25196850393700787, + "grad_norm": 0.12885810434818268, + "learning_rate": 1e-06, + "loss": -0.0115, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0020536949050438125, + "clip_ratio/high_mean": 0.0008293388964375481, + "clip_ratio/low_mean": 0.0006642993957939325, + "clip_ratio/low_min": 5.731327473768033e-05, + "clip_ratio/region_mean": 0.00149363829405047, + "completions/clipped_ratio": 0.016741071428571397, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3800.0, + "completions/mean_length": 657.0457763671875, + "completions/mean_terminated_length": 598.4937744140625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.2543015456401283, + "grad_norm": 0.13001294434070587, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 16766852.0, + "reward": 0.5714285969734192, + "reward_std": 0.18979185819625854, + "rewards/verify_math_reward/mean": 0.5714285969734192, + "rewards/verify_math_reward/std": 0.49514803290367126, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0018642976283445023, + "clip_ratio/high_mean": 0.000758285681513371, + "clip_ratio/low_mean": 0.0006911922246217728, + "clip_ratio/low_min": 6.702373775624437e-05, + "clip_ratio/region_mean": 0.0014494779243250377, + "epoch": 0.2566345873432488, + "grad_norm": 0.1258811205625534, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 110 + }, + { + "clip_ratio/high_max": 0.002164812249247916, + "clip_ratio/high_mean": 0.0008751272252993658, + "clip_ratio/low_mean": 0.0007665609100513393, + "clip_ratio/low_min": 5.334565867087804e-05, + "clip_ratio/region_mean": 0.001641688148083631, + "epoch": 0.2589676290463692, + "grad_norm": 0.12354160845279694, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0021916895493632182, + "clip_ratio/high_mean": 0.0008869685170793673, + "clip_ratio/low_mean": 0.0008263004147011088, + "clip_ratio/low_min": 6.131671671028016e-05, + "clip_ratio/region_mean": 0.0017132688881247304, + "epoch": 0.26130067074948965, + "grad_norm": 0.12259406596422195, + "learning_rate": 1e-06, + "loss": 0.0055, + "step": 112 + }, + { + "clip_ratio/high_max": 0.001697391977359075, + "clip_ratio/high_mean": 0.0006851519665360684, + "clip_ratio/low_mean": 0.0006891176126373466, + "clip_ratio/low_min": 7.1352720624418e-05, + "clip_ratio/region_mean": 0.0013742695373366587, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3655.0, + "completions/mean_length": 711.0535888671875, + "completions/mean_terminated_length": 617.889892578125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.2636337124526101, + "grad_norm": 0.12998297810554504, + "learning_rate": 1e-06, + "loss": -0.0156, + "num_tokens": 17394636.0, + "reward": 0.5223214626312256, + "reward_std": 0.20102617144584656, + "rewards/verify_math_reward/mean": 0.5223214030265808, + "rewards/verify_math_reward/std": 0.49978047609329224, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0017535710612719413, + "clip_ratio/high_mean": 0.0006981436126807239, + "clip_ratio/low_mean": 0.0008300278659589821, + "clip_ratio/low_min": 0.0001619695585759473, + "clip_ratio/region_mean": 0.0015281714950106107, + "epoch": 0.2659667541557305, + "grad_norm": 0.12897886335849762, + "learning_rate": 1e-06, + "loss": -0.0158, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0020468809198064264, + "clip_ratio/high_mean": 0.0008466970393783413, + "clip_ratio/low_mean": 0.0008826452158245957, + "clip_ratio/low_min": 0.00015975156929926015, + "clip_ratio/region_mean": 0.0017293422424700111, + "epoch": 0.268299795858851, + "grad_norm": 0.12687930464744568, + "learning_rate": 1e-06, + "loss": -0.0158, + "step": 115 + }, + { + "clip_ratio/high_max": 0.002170760475564748, + "clip_ratio/high_mean": 0.0008199298881663708, + "clip_ratio/low_mean": 0.000861517079101759, + "clip_ratio/low_min": 0.00012403995697241044, + "clip_ratio/region_mean": 0.001681446927250363, + "epoch": 0.27063283756197143, + "grad_norm": 0.12233294546604156, + "learning_rate": 1e-06, + "loss": -0.0159, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0023675355914747342, + "clip_ratio/high_mean": 0.0009463731294090394, + "clip_ratio/low_mean": 0.0006463080644607544, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001592681190231815, + "completions/clipped_ratio": 0.0189732142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3038.0, + "completions/mean_length": 641.9732666015625, + "completions/mean_terminated_length": 575.1717529296875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.27296587926509186, + "grad_norm": 0.1370822638273239, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 17995540.0, + "reward": 0.5859375, + "reward_std": 0.2263127714395523, + "rewards/verify_math_reward/mean": 0.5859375, + "rewards/verify_math_reward/std": 0.4928344786167145, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0023808866244507954, + "clip_ratio/high_mean": 0.0010071201832033694, + "clip_ratio/low_mean": 0.0006476468661276158, + "clip_ratio/low_min": 1.4744043255632278e-05, + "clip_ratio/region_mean": 0.0016547670675208792, + "epoch": 0.2752989209682123, + "grad_norm": 0.13685859739780426, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 118 + }, + { + "clip_ratio/high_max": 0.002364542982832063, + "clip_ratio/high_mean": 0.0010847648300114088, + "clip_ratio/low_mean": 0.0007723511116637383, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018571159744169563, + "epoch": 0.2776319626713328, + "grad_norm": 0.13253997266292572, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 119 + }, + { + "clip_ratio/high_max": 0.002691040266654454, + "clip_ratio/high_mean": 0.0010794019362947438, + "clip_ratio/low_mean": 0.0008650881063658744, + "clip_ratio/low_min": 4.414796330820536e-05, + "clip_ratio/region_mean": 0.0019444899953668937, + "epoch": 0.2799650043744532, + "grad_norm": 0.13256323337554932, + "learning_rate": 1e-06, + "loss": 0.0033, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0018208038454758935, + "clip_ratio/high_mean": 0.0007685111413593404, + "clip_ratio/low_mean": 0.0005416209351096768, + "clip_ratio/low_min": 5.682390292349737e-05, + "clip_ratio/region_mean": 0.0013101320873829536, + "completions/clipped_ratio": 0.0323660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3962.0, + "completions/mean_length": 776.3292846679688, + "completions/mean_terminated_length": 665.2906494140625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.28229804607757364, + "grad_norm": 0.11596595495939255, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 18657227.0, + "reward": 0.5613839626312256, + "reward_std": 0.20169946551322937, + "rewards/verify_math_reward/mean": 0.5613839030265808, + "rewards/verify_math_reward/std": 0.496494859457016, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0019638911999209085, + "clip_ratio/high_mean": 0.0007903282944425882, + "clip_ratio/low_mean": 0.0006332460270641604, + "clip_ratio/low_min": 4.578135758492863e-05, + "clip_ratio/region_mean": 0.0014235743146855384, + "epoch": 0.28463108778069407, + "grad_norm": 0.11225331574678421, + "learning_rate": 1e-06, + "loss": -0.003, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0020143422298133373, + "clip_ratio/high_mean": 0.0007923571538412943, + "clip_ratio/low_mean": 0.0006825453601777554, + "clip_ratio/low_min": 6.15502985965577e-05, + "clip_ratio/region_mean": 0.0014749025249329861, + "epoch": 0.2869641294838145, + "grad_norm": 0.11058894544839859, + "learning_rate": 1e-06, + "loss": -0.003, + "step": 123 + }, + { + "clip_ratio/high_max": 0.00207445933483541, + "clip_ratio/high_mean": 0.0008512227577739395, + "clip_ratio/low_mean": 0.0006609621759707807, + "clip_ratio/low_min": 5.902328211959684e-05, + "clip_ratio/region_mean": 0.0015121849428396672, + "epoch": 0.289297171186935, + "grad_norm": 0.11066972464323044, + "learning_rate": 1e-06, + "loss": -0.0031, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0021815118452650495, + "clip_ratio/high_mean": 0.0008497404996887781, + "clip_ratio/low_mean": 0.0004772272868649452, + "clip_ratio/low_min": 2.0509113710431848e-05, + "clip_ratio/region_mean": 0.001326967823843006, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2720.0, + "completions/mean_length": 705.1797485351562, + "completions/mean_terminated_length": 595.7984008789062, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.2916302128900554, + "grad_norm": 0.13661257922649384, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 19265220.0, + "reward": 0.609375, + "reward_std": 0.18528781831264496, + "rewards/verify_math_reward/mean": 0.609375, + "rewards/verify_math_reward/std": 0.48816296458244324, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0021811188526044134, + "clip_ratio/high_mean": 0.0008807081148916041, + "clip_ratio/low_mean": 0.0005780017290817341, + "clip_ratio/low_min": 2.8875028874608688e-05, + "clip_ratio/region_mean": 0.0014587098012270872, + "epoch": 0.29396325459317585, + "grad_norm": 0.1319681853055954, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 126 + }, + { + "clip_ratio/high_max": 0.002105122650391422, + "clip_ratio/high_mean": 0.0008940856241679285, + "clip_ratio/low_mean": 0.0006709501121804351, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015650357236154377, + "epoch": 0.2962962962962963, + "grad_norm": 0.14205984771251678, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0024559525918448344, + "clip_ratio/high_mean": 0.0009362378386867931, + "clip_ratio/low_mean": 0.0007356516161962645, + "clip_ratio/low_min": 3.0419239919865504e-05, + "clip_ratio/region_mean": 0.0016718894694349729, + "epoch": 0.29862933799941677, + "grad_norm": 0.13000363111495972, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0019030739968002308, + "clip_ratio/high_mean": 0.0006965993607082055, + "clip_ratio/low_mean": 0.0006350407384161372, + "clip_ratio/low_min": 7.164487760746852e-05, + "clip_ratio/region_mean": 0.0013316400873009115, + "completions/clipped_ratio": 0.021205357142857095, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3387.0, + "completions/mean_length": 629.5814819335938, + "completions/mean_terminated_length": 554.4822998046875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.3009623797025372, + "grad_norm": 0.13182252645492554, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 19838037.0, + "reward": 0.5691964626312256, + "reward_std": 0.16630510985851288, + "rewards/verify_math_reward/mean": 0.5691964030265808, + "rewards/verify_math_reward/std": 0.4954652488231659, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0019440464311628602, + "clip_ratio/high_mean": 0.0007428180806527962, + "clip_ratio/low_mean": 0.0006763731698811171, + "clip_ratio/low_min": 4.770672876475146e-05, + "clip_ratio/region_mean": 0.0014191912378009874, + "epoch": 0.30329542140565763, + "grad_norm": 0.12787501513957977, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0021150591092009563, + "clip_ratio/high_mean": 0.0008707102069820394, + "clip_ratio/low_mean": 0.0007152069083531387, + "clip_ratio/low_min": 4.3578196709859185e-05, + "clip_ratio/region_mean": 0.0015859171362535562, + "epoch": 0.30562846310877806, + "grad_norm": 0.12346763163805008, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 131 + }, + { + "clip_ratio/high_max": 0.002172308690205682, + "clip_ratio/high_mean": 0.0008703252351551782, + "clip_ratio/low_mean": 0.0008561224449294969, + "clip_ratio/low_min": 6.992353792156791e-05, + "clip_ratio/region_mean": 0.0017264476628042758, + "epoch": 0.3079615048118985, + "grad_norm": 0.12661373615264893, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0016295224777422845, + "clip_ratio/high_mean": 0.0006557831056852592, + "clip_ratio/low_mean": 0.0005799791524623288, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012357622545096092, + "completions/clipped_ratio": 0.0189732142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3400.0, + "completions/mean_length": 681.9006958007812, + "completions/mean_terminated_length": 615.8713989257812, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.310294546515019, + "grad_norm": 0.12366899102926254, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 20462780.0, + "reward": 0.5725446939468384, + "reward_std": 0.18144823610782623, + "rewards/verify_math_reward/mean": 0.5725446343421936, + "rewards/verify_math_reward/std": 0.49498558044433594, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0019445515772531508, + "clip_ratio/high_mean": 0.0007925075506136636, + "clip_ratio/low_mean": 0.0006260709888010751, + "clip_ratio/low_min": 2.2393407562049106e-05, + "clip_ratio/region_mean": 0.0014185785184963606, + "epoch": 0.3126275882181394, + "grad_norm": 0.12053171545267105, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 134 + }, + { + "clip_ratio/high_max": 0.002057051740848692, + "clip_ratio/high_mean": 0.000796252378677309, + "clip_ratio/low_mean": 0.0006900128737470368, + "clip_ratio/low_min": 2.3946360670379363e-05, + "clip_ratio/region_mean": 0.0014862652569718193, + "epoch": 0.31496062992125984, + "grad_norm": 0.11586639285087585, + "learning_rate": 1e-06, + "loss": 0.003, + "step": 135 + }, + { + "clip_ratio/high_max": 0.00205574548090226, + "clip_ratio/high_mean": 0.0008225631409004563, + "clip_ratio/low_mean": 0.0007366543359239586, + "clip_ratio/low_min": 2.2393407562049106e-05, + "clip_ratio/region_mean": 0.0015592174604535103, + "epoch": 0.31729367162438027, + "grad_norm": 0.11688009649515152, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 136 + }, + { + "clip_ratio/high_max": 0.002024527595494874, + "clip_ratio/high_mean": 0.0008082161057245685, + "clip_ratio/low_mean": 0.0005800189078399853, + "clip_ratio/low_min": 1.8047934645437635e-05, + "clip_ratio/region_mean": 0.0013882350067433435, + "completions/clipped_ratio": 0.0323660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3794.0, + "completions/mean_length": 740.0647583007812, + "completions/mean_terminated_length": 627.8131103515625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.31962671332750076, + "grad_norm": 0.13833001255989075, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 21090750.0, + "reward": 0.6171875, + "reward_std": 0.19227458536624908, + "rewards/verify_math_reward/mean": 0.6171875, + "rewards/verify_math_reward/std": 0.4863446056842804, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0022252232884056866, + "clip_ratio/high_mean": 0.0008493184159306111, + "clip_ratio/low_mean": 0.0006523041329273838, + "clip_ratio/low_min": 2.575105281721335e-05, + "clip_ratio/region_mean": 0.0015016225625004154, + "epoch": 0.3219597550306212, + "grad_norm": 0.1322672963142395, + "learning_rate": 1e-06, + "loss": -0.0079, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0025541731010889634, + "clip_ratio/high_mean": 0.0009663519540481502, + "clip_ratio/low_mean": 0.0007721306756138802, + "clip_ratio/low_min": 4.925636767438846e-05, + "clip_ratio/region_mean": 0.0017384826496709138, + "epoch": 0.3242927967337416, + "grad_norm": 0.13279646635055542, + "learning_rate": 1e-06, + "loss": -0.0081, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0024515801342204213, + "clip_ratio/high_mean": 0.0009114635831792839, + "clip_ratio/low_mean": 0.0007970291981109767, + "clip_ratio/low_min": 8.409956171817612e-05, + "clip_ratio/region_mean": 0.0017084927821997553, + "epoch": 0.32662583843686205, + "grad_norm": 0.13724446296691895, + "learning_rate": 1e-06, + "loss": -0.0081, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0021759426381322555, + "clip_ratio/high_mean": 0.0009506453116046032, + "clip_ratio/low_mean": 0.0007584911727462895, + "clip_ratio/low_min": 6.67031317789224e-05, + "clip_ratio/region_mean": 0.0017091364570660517, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3260.0, + "completions/mean_length": 750.388427734375, + "completions/mean_terminated_length": 642.4654541015625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "epoch": 0.3289588801399825, + "grad_norm": 0.14134296774864197, + "learning_rate": 1e-06, + "loss": -0.0084, + "num_tokens": 21741234.0, + "reward": 0.5714285969734192, + "reward_std": 0.2296549528837204, + "rewards/verify_math_reward/mean": 0.5714285969734192, + "rewards/verify_math_reward/std": 0.49514803290367126, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0024316191629623063, + "clip_ratio/high_mean": 0.0009980091144825565, + "clip_ratio/low_mean": 0.0008398861700698035, + "clip_ratio/low_min": 2.4851598027453292e-05, + "clip_ratio/region_mean": 0.0018378952809143811, + "epoch": 0.33129192184310297, + "grad_norm": 0.1454172432422638, + "learning_rate": 1e-06, + "loss": -0.0085, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0026167565447394736, + "clip_ratio/high_mean": 0.0010993376454280224, + "clip_ratio/low_mean": 0.0009133251132880105, + "clip_ratio/low_min": 9.388228863826953e-05, + "clip_ratio/region_mean": 0.002012662727793213, + "epoch": 0.3336249635462234, + "grad_norm": 0.13567112386226654, + "learning_rate": 1e-06, + "loss": -0.0086, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0025247305602533743, + "clip_ratio/high_mean": 0.001162609776656609, + "clip_ratio/low_mean": 0.0009580292135069612, + "clip_ratio/low_min": 2.1999296222929843e-05, + "clip_ratio/region_mean": 0.002120639015629422, + "epoch": 0.3359580052493438, + "grad_norm": 0.1326664388179779, + "learning_rate": 1e-06, + "loss": -0.0087, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0018576670699985698, + "clip_ratio/high_mean": 0.0008139792898873566, + "clip_ratio/low_mean": 0.000670039120450383, + "clip_ratio/low_min": 2.8118673071730882e-05, + "clip_ratio/region_mean": 0.0014840184157947078, + "completions/clipped_ratio": 0.0267857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2907.0, + "completions/mean_length": 681.7734375, + "completions/mean_terminated_length": 587.8038940429688, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.33829104695246426, + "grad_norm": 0.14912009239196777, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 22330527.0, + "reward": 0.5758928656578064, + "reward_std": 0.21725811064243317, + "rewards/verify_math_reward/mean": 0.5758928656578064, + "rewards/verify_math_reward/std": 0.49448272585868835, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0025272179118474014, + "clip_ratio/high_mean": 0.0010477910109329969, + "clip_ratio/low_mean": 0.0007653703396499623, + "clip_ratio/low_min": 5.663632146024611e-05, + "clip_ratio/region_mean": 0.0018131613687728532, + "epoch": 0.34062408865558474, + "grad_norm": 0.14827348291873932, + "learning_rate": 1e-06, + "loss": -0.0072, + "step": 146 + }, + { + "clip_ratio/high_max": 0.002495536347851157, + "clip_ratio/high_mean": 0.0010808328224811703, + "clip_ratio/low_mean": 0.0009262939474865561, + "clip_ratio/low_min": 3.505966014927253e-05, + "clip_ratio/region_mean": 0.0020071267208550125, + "epoch": 0.3429571303587052, + "grad_norm": 0.1410999745130539, + "learning_rate": 1e-06, + "loss": -0.0073, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0025222120893886313, + "clip_ratio/high_mean": 0.0010745862746261992, + "clip_ratio/low_mean": 0.0010007284636230906, + "clip_ratio/low_min": 9.22423450901988e-05, + "clip_ratio/region_mean": 0.0020753147182404064, + "epoch": 0.3452901720618256, + "grad_norm": 0.13900841772556305, + "learning_rate": 1e-06, + "loss": -0.0073, + "step": 148 + }, + { + "clip_ratio/high_max": 0.001856691567809321, + "clip_ratio/high_mean": 0.0007320679142139852, + "clip_ratio/low_mean": 0.00043941574040218256, + "clip_ratio/low_min": 1.2037750821036752e-05, + "clip_ratio/region_mean": 0.001171483687357977, + "completions/clipped_ratio": 0.044642857142857095, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2857.0, + "completions/mean_length": 756.950927734375, + "completions/mean_terminated_length": 600.9205322265625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.34762321376494604, + "grad_norm": 0.131646066904068, + "learning_rate": 1e-06, + "loss": -0.0145, + "num_tokens": 22934251.0, + "reward": 0.5881696939468384, + "reward_std": 0.1741565614938736, + "rewards/verify_math_reward/mean": 0.5881696343421936, + "rewards/verify_math_reward/std": 0.4924395978450775, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0019837734871543944, + "clip_ratio/high_mean": 0.000780138074333081, + "clip_ratio/low_mean": 0.0005641352909151465, + "clip_ratio/low_min": 1.2037750821036752e-05, + "clip_ratio/region_mean": 0.0013442733616102487, + "epoch": 0.34995625546806647, + "grad_norm": 0.1266884207725525, + "learning_rate": 1e-06, + "loss": -0.0147, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0021597502782242373, + "clip_ratio/high_mean": 0.0008289266588690225, + "clip_ratio/low_mean": 0.0006222497831913643, + "clip_ratio/low_min": 1.230072848557029e-05, + "clip_ratio/region_mean": 0.0014511764529743232, + "epoch": 0.35228929717118695, + "grad_norm": 0.12174595147371292, + "learning_rate": 1e-06, + "loss": -0.0148, + "step": 151 + }, + { + "clip_ratio/high_max": 0.002154038018488791, + "clip_ratio/high_mean": 0.0008754699047130998, + "clip_ratio/low_mean": 0.0007365460114669986, + "clip_ratio/low_min": 2.460145697114058e-05, + "clip_ratio/region_mean": 0.00161201594164595, + "epoch": 0.3546223388743074, + "grad_norm": 0.12020301818847656, + "learning_rate": 1e-06, + "loss": -0.0148, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0022428504744311795, + "clip_ratio/high_mean": 0.0008285467338282615, + "clip_ratio/low_mean": 0.0007076413589857111, + "clip_ratio/low_min": 1.1980065210082103e-05, + "clip_ratio/region_mean": 0.0015361880941782147, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3907.0, + "completions/mean_length": 724.7801513671875, + "completions/mean_terminated_length": 599.920166015625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.3569553805774278, + "grad_norm": 0.1613074690103531, + "learning_rate": 1e-06, + "loss": -0.016, + "num_tokens": 23548446.0, + "reward": 0.5613839626312256, + "reward_std": 0.19839511811733246, + "rewards/verify_math_reward/mean": 0.5613839030265808, + "rewards/verify_math_reward/std": 0.496494859457016, + "step": 153 + }, + { + "clip_ratio/high_max": 0.002339055143238511, + "clip_ratio/high_mean": 0.0009058997620741138, + "clip_ratio/low_mean": 0.0006424031689675758, + "clip_ratio/low_min": 4.490345145313768e-05, + "clip_ratio/region_mean": 0.001548302905575838, + "epoch": 0.35928842228054825, + "grad_norm": 0.15127702057361603, + "learning_rate": 1e-06, + "loss": -0.0161, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0029553086278610863, + "clip_ratio/high_mean": 0.0010807084327097982, + "clip_ratio/low_mean": 0.0008939045674196677, + "clip_ratio/low_min": 3.684923558466835e-05, + "clip_ratio/region_mean": 0.001974613027414307, + "epoch": 0.36162146398366873, + "grad_norm": 0.1411028504371643, + "learning_rate": 1e-06, + "loss": -0.0163, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0028567789049702697, + "clip_ratio/high_mean": 0.0010759717879409436, + "clip_ratio/low_mean": 0.0010138653688045451, + "clip_ratio/low_min": 8.965483993961243e-05, + "clip_ratio/region_mean": 0.002089837238600012, + "epoch": 0.36395450568678916, + "grad_norm": 0.13891728222370148, + "learning_rate": 1e-06, + "loss": -0.0164, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0019052833595196716, + "clip_ratio/high_mean": 0.0007558979359600926, + "clip_ratio/low_mean": 0.0005618926061288221, + "clip_ratio/low_min": 1.3661202501680236e-05, + "clip_ratio/region_mean": 0.0013177905493648723, + "completions/clipped_ratio": 0.030133928571428603, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3188.0, + "completions/mean_length": 712.7745971679688, + "completions/mean_terminated_length": 607.6571044921875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.3662875473899096, + "grad_norm": 0.1499905288219452, + "learning_rate": 1e-06, + "loss": -0.0182, + "num_tokens": 24178852.0, + "reward": 0.5625, + "reward_std": 0.18693754076957703, + "rewards/verify_math_reward/mean": 0.5625, + "rewards/verify_math_reward/std": 0.49635544419288635, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0022266593587119132, + "clip_ratio/high_mean": 0.0008535192591807572, + "clip_ratio/low_mean": 0.0005987178856230457, + "clip_ratio/low_min": 7.914327306934865e-05, + "clip_ratio/region_mean": 0.0014522371420753188, + "epoch": 0.36862058909303, + "grad_norm": 0.14097832143306732, + "learning_rate": 1e-06, + "loss": -0.0184, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0020058465452166274, + "clip_ratio/high_mean": 0.0008499585565004963, + "clip_ratio/low_mean": 0.0007910596796136815, + "clip_ratio/low_min": 9.284633597417269e-05, + "clip_ratio/region_mean": 0.001641018214286305, + "epoch": 0.37095363079615046, + "grad_norm": 0.13096092641353607, + "learning_rate": 1e-06, + "loss": -0.0185, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0022000707976985723, + "clip_ratio/high_mean": 0.0009491592900303658, + "clip_ratio/low_mean": 0.0007415883028443204, + "clip_ratio/low_min": 8.971292754722526e-05, + "clip_ratio/region_mean": 0.0016907475728658028, + "epoch": 0.37328667249927094, + "grad_norm": 0.13571088016033173, + "learning_rate": 1e-06, + "loss": -0.0186, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0017179410206153989, + "clip_ratio/high_mean": 0.0006355128489303752, + "clip_ratio/low_mean": 0.0005784768000012264, + "clip_ratio/low_min": 2.933319137810031e-05, + "clip_ratio/region_mean": 0.0012139896389271598, + "completions/clipped_ratio": 0.0513392857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3016.0, + "completions/mean_length": 831.2489013671875, + "completions/mean_terminated_length": 654.5682373046875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 0.3756197142023914, + "grad_norm": 0.13254326581954956, + "learning_rate": 1e-06, + "loss": -0.0197, + "num_tokens": 24837019.0, + "reward": 0.5212053656578064, + "reward_std": 0.1868615597486496, + "rewards/verify_math_reward/mean": 0.5212053656578064, + "rewards/verify_math_reward/std": 0.49982914328575134, + "step": 161 + }, + { + "clip_ratio/high_max": 0.002329044058569707, + "clip_ratio/high_mean": 0.0008827270321489777, + "clip_ratio/low_mean": 0.0006735311708325753, + "clip_ratio/low_min": 1.187310044770129e-05, + "clip_ratio/region_mean": 0.0015562582266284153, + "epoch": 0.3779527559055118, + "grad_norm": 0.1271568387746811, + "learning_rate": 1e-06, + "loss": -0.0199, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0021151488945179153, + "clip_ratio/high_mean": 0.0008055039343162207, + "clip_ratio/low_mean": 0.0008194172060029814, + "clip_ratio/low_min": 6.695355841657147e-05, + "clip_ratio/region_mean": 0.0016249211257672869, + "epoch": 0.38028579760863224, + "grad_norm": 0.12323304265737534, + "learning_rate": 1e-06, + "loss": -0.02, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0023609396230312996, + "clip_ratio/high_mean": 0.0008703189359948738, + "clip_ratio/low_mean": 0.0008447565996902995, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017150755229522474, + "epoch": 0.3826188393117527, + "grad_norm": 0.11949825286865234, + "learning_rate": 1e-06, + "loss": -0.02, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0016120658874569926, + "clip_ratio/high_mean": 0.0006476519338320941, + "clip_ratio/low_mean": 0.0004724546288343845, + "clip_ratio/low_min": 2.183024844271131e-05, + "clip_ratio/region_mean": 0.0011201065572095104, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3809.0, + "completions/mean_length": 746.591552734375, + "completions/mean_terminated_length": 622.5393676757812, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.38495188101487315, + "grad_norm": 0.1378088742494583, + "learning_rate": 1e-06, + "loss": -0.0108, + "num_tokens": 25466093.0, + "reward": 0.53125, + "reward_std": 0.17765209078788757, + "rewards/verify_math_reward/mean": 0.53125, + "rewards/verify_math_reward/std": 0.4993011951446533, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0019100851532130037, + "clip_ratio/high_mean": 0.0007693097859373665, + "clip_ratio/low_mean": 0.0006120883754192619, + "clip_ratio/low_min": 1.1038502634619363e-05, + "clip_ratio/region_mean": 0.0013813981604471337, + "epoch": 0.3872849227179936, + "grad_norm": 0.13167965412139893, + "learning_rate": 1e-06, + "loss": -0.0109, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0020490221177169587, + "clip_ratio/high_mean": 0.0007186526363511803, + "clip_ratio/low_mean": 0.0005681742277374724, + "clip_ratio/low_min": 1.5794794308021665e-05, + "clip_ratio/region_mean": 0.0012868268458987586, + "epoch": 0.389617964421114, + "grad_norm": 0.13167418539524078, + "learning_rate": 1e-06, + "loss": -0.011, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0022456580627476797, + "clip_ratio/high_mean": 0.0008425064788752934, + "clip_ratio/low_mean": 0.000783710835094098, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001626217275770614, + "epoch": 0.39195100612423445, + "grad_norm": 0.1238115131855011, + "learning_rate": 1e-06, + "loss": -0.0111, + "step": 168 + }, + { + "clip_ratio/high_max": 0.002102460159221664, + "clip_ratio/high_mean": 0.0007984250751178479, + "clip_ratio/low_mean": 0.0005976143966108793, + "clip_ratio/low_min": 1.253258506039856e-05, + "clip_ratio/region_mean": 0.0013960394826426636, + "completions/clipped_ratio": 0.0323660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3499.0, + "completions/mean_length": 747.599365234375, + "completions/mean_terminated_length": 635.5997314453125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.39428404782735493, + "grad_norm": 0.14102819561958313, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 26103294.0, + "reward": 0.6015625, + "reward_std": 0.20320719480514526, + "rewards/verify_math_reward/mean": 0.6015625, + "rewards/verify_math_reward/std": 0.48984986543655396, + "step": 169 + }, + { + "clip_ratio/high_max": 0.002048708603979321, + "clip_ratio/high_mean": 0.000856896496770787, + "clip_ratio/low_mean": 0.0007294035949598765, + "clip_ratio/low_min": 2.252387821499724e-05, + "clip_ratio/region_mean": 0.0015863000808167271, + "epoch": 0.39661708953047536, + "grad_norm": 0.13521471619606018, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0022176205384312198, + "clip_ratio/high_mean": 0.0008204550431401003, + "clip_ratio/low_mean": 0.00080624979455024, + "clip_ratio/low_min": 1.025935671350453e-05, + "clip_ratio/region_mean": 0.0016267048486042768, + "epoch": 0.3989501312335958, + "grad_norm": 0.132305309176445, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 171 + }, + { + "clip_ratio/high_max": 0.002695338276680559, + "clip_ratio/high_mean": 0.0010746097032097168, + "clip_ratio/low_mean": 0.0008721355006855447, + "clip_ratio/low_min": 3.679356450447813e-05, + "clip_ratio/region_mean": 0.0019467451784294099, + "epoch": 0.4012831729367162, + "grad_norm": 0.12492279708385468, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0018454058672432438, + "clip_ratio/high_mean": 0.0007141997180042381, + "clip_ratio/low_mean": 0.0004877425571976346, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012019422865705565, + "completions/clipped_ratio": 0.0279017857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2615.0, + "completions/mean_length": 666.3002319335938, + "completions/mean_terminated_length": 567.8587646484375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.4036162146398367, + "grad_norm": 0.14558936655521393, + "learning_rate": 1e-06, + "loss": -0.0112, + "num_tokens": 26684091.0, + "reward": 0.6194196939468384, + "reward_std": 0.17836888134479523, + "rewards/verify_math_reward/mean": 0.6194196343421936, + "rewards/verify_math_reward/std": 0.48580074310302734, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0020503912965068594, + "clip_ratio/high_mean": 0.0008501818829245167, + "clip_ratio/low_mean": 0.0005826316228194628, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001432813543942757, + "epoch": 0.40594925634295714, + "grad_norm": 0.13812094926834106, + "learning_rate": 1e-06, + "loss": -0.0113, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0021088762587169185, + "clip_ratio/high_mean": 0.000818512820842443, + "clip_ratio/low_mean": 0.0007027353531157132, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001521248159406241, + "epoch": 0.4082822980460776, + "grad_norm": 0.13025124371051788, + "learning_rate": 1e-06, + "loss": -0.0114, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0024314813636010513, + "clip_ratio/high_mean": 0.0009654291134211235, + "clip_ratio/low_mean": 0.0008212085613195086, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017866376874735579, + "epoch": 0.410615339749198, + "grad_norm": 0.1292300820350647, + "learning_rate": 1e-06, + "loss": -0.0115, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0019029411487281322, + "clip_ratio/high_mean": 0.0007594223316118587, + "clip_ratio/low_mean": 0.0005212702262724633, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001280692555155838, + "completions/clipped_ratio": 0.056919642857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3785.0, + "completions/mean_length": 846.482177734375, + "completions/mean_terminated_length": 650.357421875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.41294838145231844, + "grad_norm": 0.14513805508613586, + "learning_rate": 1e-06, + "loss": -0.0296, + "num_tokens": 27327035.0, + "reward": 0.5915178656578064, + "reward_std": 0.195994034409523, + "rewards/verify_math_reward/mean": 0.5915178656578064, + "rewards/verify_math_reward/std": 0.49182769656181335, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0019132906018057838, + "clip_ratio/high_mean": 0.0008263411073130555, + "clip_ratio/low_mean": 0.0006613379118789453, + "clip_ratio/low_min": 4.798547797690844e-05, + "clip_ratio/region_mean": 0.0014876790010021068, + "epoch": 0.4152814231554389, + "grad_norm": 0.13300026953220367, + "learning_rate": 1e-06, + "loss": -0.0297, + "step": 178 + }, + { + "clip_ratio/high_max": 0.002202157396823168, + "clip_ratio/high_mean": 0.0009413544539711438, + "clip_ratio/low_mean": 0.0007774288296786835, + "clip_ratio/low_min": 2.999307071149815e-05, + "clip_ratio/region_mean": 0.0017187832781928591, + "epoch": 0.41761446485855935, + "grad_norm": 0.13014237582683563, + "learning_rate": 1e-06, + "loss": -0.0298, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0022671789629384875, + "clip_ratio/high_mean": 0.0009622640500310808, + "clip_ratio/low_mean": 0.0008495632300764555, + "clip_ratio/low_min": 6.966615637793439e-05, + "clip_ratio/region_mean": 0.0018118272419087589, + "epoch": 0.4199475065616798, + "grad_norm": 0.127271369099617, + "learning_rate": 1e-06, + "loss": -0.0299, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0017390164030075539, + "clip_ratio/high_mean": 0.000820291770651238, + "clip_ratio/low_mean": 0.0005455044865811942, + "clip_ratio/low_min": 1.134713147621369e-05, + "clip_ratio/region_mean": 0.0013657962263096124, + "completions/clipped_ratio": 0.041294642857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3679.0, + "completions/mean_length": 823.5189819335938, + "completions/mean_terminated_length": 682.5623168945312, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.4222805482648002, + "grad_norm": 0.13623927533626556, + "learning_rate": 1e-06, + "loss": -0.0081, + "num_tokens": 28001804.0, + "reward": 0.5691964626312256, + "reward_std": 0.20613498985767365, + "rewards/verify_math_reward/mean": 0.5691964030265808, + "rewards/verify_math_reward/std": 0.4954652488231659, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0019057195422647055, + "clip_ratio/high_mean": 0.0009002240440167952, + "clip_ratio/low_mean": 0.0007010096378508024, + "clip_ratio/low_min": 2.528395452827681e-05, + "clip_ratio/region_mean": 0.0016012336964195129, + "epoch": 0.4246135899679207, + "grad_norm": 0.1361505389213562, + "learning_rate": 1e-06, + "loss": -0.0082, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0020468775073823053, + "clip_ratio/high_mean": 0.000950065310462378, + "clip_ratio/low_mean": 0.0007298784039448947, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001679943688941421, + "epoch": 0.42694663167104113, + "grad_norm": 0.16369010508060455, + "learning_rate": 1e-06, + "loss": -0.0083, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0023009092692518607, + "clip_ratio/high_mean": 0.0010173881219088798, + "clip_ratio/low_mean": 0.0009192527759296354, + "clip_ratio/low_min": 2.4488183044013567e-05, + "clip_ratio/region_mean": 0.0019366408450878225, + "epoch": 0.42927967337416156, + "grad_norm": 0.12531021237373352, + "learning_rate": 1e-06, + "loss": -0.0084, + "step": 184 + }, + { + "clip_ratio/high_max": 0.001886182013549842, + "clip_ratio/high_mean": 0.0008456229570583673, + "clip_ratio/low_mean": 0.0006428303222492104, + "clip_ratio/low_min": 4.2322684748796746e-05, + "clip_ratio/region_mean": 0.0014884532793075778, + "completions/clipped_ratio": 0.0479910714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4070.0, + "completions/mean_length": 827.0301513671875, + "completions/mean_terminated_length": 662.2402954101562, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.431612715077282, + "grad_norm": 0.15010322630405426, + "learning_rate": 1e-06, + "loss": -0.0112, + "num_tokens": 28651591.0, + "reward": 0.5803571939468384, + "reward_std": 0.21241775155067444, + "rewards/verify_math_reward/mean": 0.5803571343421936, + "rewards/verify_math_reward/std": 0.4937761127948761, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0018301246491319034, + "clip_ratio/high_mean": 0.0008700616945134243, + "clip_ratio/low_mean": 0.0007397959761874517, + "clip_ratio/low_min": 4.726601855509216e-05, + "clip_ratio/region_mean": 0.001609857652510982, + "epoch": 0.4339457567804024, + "grad_norm": 0.14428773522377014, + "learning_rate": 1e-06, + "loss": -0.0114, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0021695718314731494, + "clip_ratio/high_mean": 0.0010226818194496445, + "clip_ratio/low_mean": 0.0008679816110088723, + "clip_ratio/low_min": 5.975446765660308e-05, + "clip_ratio/region_mean": 0.0018906634286395274, + "epoch": 0.4362787984835229, + "grad_norm": 0.14368951320648193, + "learning_rate": 1e-06, + "loss": -0.0115, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0022048446626286022, + "clip_ratio/high_mean": 0.0010107983544003218, + "clip_ratio/low_mean": 0.0009332002828159602, + "clip_ratio/low_min": 0.00012687695470958715, + "clip_ratio/region_mean": 0.001943998628121335, + "epoch": 0.43861184018664334, + "grad_norm": 0.13704383373260498, + "learning_rate": 1e-06, + "loss": -0.0116, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0022271957059274428, + "clip_ratio/high_mean": 0.0008861481364874635, + "clip_ratio/low_mean": 0.0005503760521605727, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014365241913765203, + "completions/clipped_ratio": 0.060267857142857095, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2837.0, + "completions/mean_length": 823.7098388671875, + "completions/mean_terminated_length": 613.8480224609375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.4409448818897638, + "grad_norm": 0.1536131203174591, + "learning_rate": 1e-06, + "loss": -0.0198, + "num_tokens": 29263099.0, + "reward": 0.5479910969734192, + "reward_std": 0.1965574473142624, + "rewards/verify_math_reward/mean": 0.5479910969734192, + "rewards/verify_math_reward/std": 0.49796950817108154, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0025035149592440575, + "clip_ratio/high_mean": 0.0010717627228586935, + "clip_ratio/low_mean": 0.0006566193624166772, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017283820998272859, + "epoch": 0.4432779235928842, + "grad_norm": 0.1450500190258026, + "learning_rate": 1e-06, + "loss": -0.0199, + "step": 190 + }, + { + "clip_ratio/high_max": 0.002502236246073153, + "clip_ratio/high_mean": 0.0010475442395545542, + "clip_ratio/low_mean": 0.0008652243286633166, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019127685518469661, + "epoch": 0.4456109652960047, + "grad_norm": 0.14346785843372345, + "learning_rate": 1e-06, + "loss": -0.0201, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0028825998524553142, + "clip_ratio/high_mean": 0.0011423403193475679, + "clip_ratio/low_mean": 0.0009022599560921662, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002044600318185985, + "epoch": 0.4479440069991251, + "grad_norm": 0.13926590979099274, + "learning_rate": 1e-06, + "loss": -0.0202, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0017455844244977925, + "clip_ratio/high_mean": 0.0007080485738697462, + "clip_ratio/low_mean": 0.0004908380342385499, + "clip_ratio/low_min": 1.2672343473241199e-05, + "clip_ratio/region_mean": 0.0011988865880994126, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3357.0, + "completions/mean_length": 821.0848388671875, + "completions/mean_terminated_length": 631.6268920898438, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.45027704870224555, + "grad_norm": 0.15021604299545288, + "learning_rate": 1e-06, + "loss": -0.0227, + "num_tokens": 29881103.0, + "reward": 0.527901828289032, + "reward_std": 0.17585225403308868, + "rewards/verify_math_reward/mean": 0.5279017686843872, + "rewards/verify_math_reward/std": 0.49949970841407776, + "step": 193 + }, + { + "clip_ratio/high_max": 0.002242021197162103, + "clip_ratio/high_mean": 0.000786731641710503, + "clip_ratio/low_mean": 0.0006091638706493541, + "clip_ratio/low_min": 5.467925439006649e-05, + "clip_ratio/region_mean": 0.0013958955023554154, + "epoch": 0.452610090405366, + "grad_norm": 0.13321596384048462, + "learning_rate": 1e-06, + "loss": -0.0229, + "step": 194 + }, + { + "clip_ratio/high_max": 0.002216878390754573, + "clip_ratio/high_mean": 0.0008677324040036183, + "clip_ratio/low_mean": 0.0007575850731882383, + "clip_ratio/low_min": 2.8709254365821835e-05, + "clip_ratio/region_mean": 0.001625317454454489, + "epoch": 0.4549431321084864, + "grad_norm": 0.12558101117610931, + "learning_rate": 1e-06, + "loss": -0.0231, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0021615528021357022, + "clip_ratio/high_mean": 0.0008111449260468362, + "clip_ratio/low_mean": 0.0008866158423188608, + "clip_ratio/low_min": 6.265901265578577e-05, + "clip_ratio/region_mean": 0.0016977607738226652, + "epoch": 0.4572761738116069, + "grad_norm": 0.12740087509155273, + "learning_rate": 1e-06, + "loss": -0.0231, + "step": 196 + }, + { + "clip_ratio/high_max": 0.002232287883089157, + "clip_ratio/high_mean": 0.0009673337535787141, + "clip_ratio/low_mean": 0.0005202810407354264, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001487614783400204, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3615.0, + "completions/mean_length": 888.4888916015625, + "completions/mean_terminated_length": 616.6658935546875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.45960921551472733, + "grad_norm": 0.15869812667369843, + "learning_rate": 1e-06, + "loss": -0.0224, + "num_tokens": 30493005.0, + "reward": 0.5814732313156128, + "reward_std": 0.191481813788414, + "rewards/verify_math_reward/mean": 0.5814732313156128, + "rewards/verify_math_reward/std": 0.4935929775238037, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0022578981042897794, + "clip_ratio/high_mean": 0.001041828054439975, + "clip_ratio/low_mean": 0.0005890234124308336, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016308514786942396, + "epoch": 0.46194225721784776, + "grad_norm": 0.140419140458107, + "learning_rate": 1e-06, + "loss": -0.0225, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0025456565563217737, + "clip_ratio/high_mean": 0.0011061316454288317, + "clip_ratio/low_mean": 0.0007613937241330859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001867525355919497, + "epoch": 0.4642752989209682, + "grad_norm": 0.1353747397661209, + "learning_rate": 1e-06, + "loss": -0.0226, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0024561806458223145, + "clip_ratio/high_mean": 0.0011150241025461582, + "clip_ratio/low_mean": 0.0008858814271661686, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020009055187983904, + "epoch": 0.4666083406240887, + "grad_norm": 0.13788971304893494, + "learning_rate": 1e-06, + "loss": -0.0228, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0020815222633245867, + "clip_ratio/high_mean": 0.0008058053899731021, + "clip_ratio/low_mean": 0.0007494486671930645, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015552540426142514, + "completions/clipped_ratio": 0.0714285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3908.0, + "completions/mean_length": 894.0234985351562, + "completions/mean_terminated_length": 647.7175903320312, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.4689413823272091, + "grad_norm": 0.16608601808547974, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 31113114.0, + "reward": 0.606026828289032, + "reward_std": 0.19621339440345764, + "rewards/verify_math_reward/mean": 0.6060267686843872, + "rewards/verify_math_reward/std": 0.48890194296836853, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0024526705165044405, + "clip_ratio/high_mean": 0.0009260617298423313, + "clip_ratio/low_mean": 0.0008891882062016521, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001815249961509835, + "epoch": 0.47127442403032954, + "grad_norm": 0.15323638916015625, + "learning_rate": 1e-06, + "loss": -0.0031, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0025142339000012726, + "clip_ratio/high_mean": 0.0010169709958063322, + "clip_ratio/low_mean": 0.0009712312239571474, + "clip_ratio/low_min": 1.633986903470941e-05, + "clip_ratio/region_mean": 0.0019882022024830803, + "epoch": 0.47360746573345, + "grad_norm": 0.14349707961082458, + "learning_rate": 1e-06, + "loss": -0.0033, + "step": 203 + }, + { + "clip_ratio/high_max": 0.002481605850334745, + "clip_ratio/high_mean": 0.0010075547861561063, + "clip_ratio/low_mean": 0.001105919905967312, + "clip_ratio/low_min": 3.267973806941882e-05, + "clip_ratio/region_mean": 0.0021134746784809977, + "epoch": 0.4759405074365704, + "grad_norm": 0.14359316229820251, + "learning_rate": 1e-06, + "loss": -0.0033, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0021208442267379723, + "clip_ratio/high_mean": 0.0008455073002551217, + "clip_ratio/low_mean": 0.000589847798437404, + "clip_ratio/low_min": 1.098997745430097e-05, + "clip_ratio/region_mean": 0.0014353550868690945, + "completions/clipped_ratio": 0.0647321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3849.0, + "completions/mean_length": 836.6183471679688, + "completions/mean_terminated_length": 611.0286865234375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.4782735491396909, + "grad_norm": 0.1829909235239029, + "learning_rate": 1e-06, + "loss": -0.0118, + "num_tokens": 31704844.0, + "reward": 0.6350446939468384, + "reward_std": 0.17585085332393646, + "rewards/verify_math_reward/mean": 0.6350446343421936, + "rewards/verify_math_reward/std": 0.481686532497406, + "step": 205 + }, + { + "clip_ratio/high_max": 0.00222759220196167, + "clip_ratio/high_mean": 0.0009623481091693975, + "clip_ratio/low_mean": 0.0005849398330610711, + "clip_ratio/low_min": 2.197995490860194e-05, + "clip_ratio/region_mean": 0.0015472879240405746, + "epoch": 0.4806065908428113, + "grad_norm": 0.1492559164762497, + "learning_rate": 1e-06, + "loss": -0.0119, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0026327979649067856, + "clip_ratio/high_mean": 0.0010532176293054363, + "clip_ratio/low_mean": 0.0006820712005719543, + "clip_ratio/low_min": 2.197995490860194e-05, + "clip_ratio/region_mean": 0.00173528883169638, + "epoch": 0.48293963254593175, + "grad_norm": 0.14454133808612823, + "learning_rate": 1e-06, + "loss": -0.0121, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0027388943490223028, + "clip_ratio/high_mean": 0.001097951564588584, + "clip_ratio/low_mean": 0.0009181690720652114, + "clip_ratio/low_min": 1.605858233233448e-05, + "clip_ratio/region_mean": 0.002016120655753184, + "epoch": 0.4852726742490522, + "grad_norm": 0.13695617020130157, + "learning_rate": 1e-06, + "loss": -0.0122, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0018147952578146942, + "clip_ratio/high_mean": 0.0006773728041480354, + "clip_ratio/low_mean": 0.0006203562052178313, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012977289879927412, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3284.0, + "completions/mean_length": 784.1239013671875, + "completions/mean_terminated_length": 592.5277099609375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.48760571595217267, + "grad_norm": 0.15552891790866852, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 32292467.0, + "reward": 0.5848214626312256, + "reward_std": 0.17130544781684875, + "rewards/verify_math_reward/mean": 0.5848214030265808, + "rewards/verify_math_reward/std": 0.49302801489830017, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0021525007468881086, + "clip_ratio/high_mean": 0.0008598787881055614, + "clip_ratio/low_mean": 0.0007220445304483292, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015819232976355124, + "epoch": 0.4899387576552931, + "grad_norm": 0.15443633496761322, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0023462451645173132, + "clip_ratio/high_mean": 0.0009156435035038157, + "clip_ratio/low_mean": 0.000828188593914092, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017438321083318442, + "epoch": 0.49227179935841353, + "grad_norm": 0.1460455358028412, + "learning_rate": 1e-06, + "loss": 0.002, + "step": 211 + }, + { + "clip_ratio/high_max": 0.002625642860948574, + "clip_ratio/high_mean": 0.0009293550920119742, + "clip_ratio/low_mean": 0.0009462321759201586, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018755872806650586, + "epoch": 0.49460484106153396, + "grad_norm": 0.1429179459810257, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0020073093110113405, + "clip_ratio/high_mean": 0.000759914846639731, + "clip_ratio/low_mean": 0.000462331128801452, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012222459736221936, + "completions/clipped_ratio": 0.0513392857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3670.0, + "completions/mean_length": 747.1752319335938, + "completions/mean_terminated_length": 565.9447021484375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.4969378827646544, + "grad_norm": 0.15830470621585846, + "learning_rate": 1e-06, + "loss": -0.027, + "num_tokens": 32863360.0, + "reward": 0.621651828289032, + "reward_std": 0.1690923273563385, + "rewards/verify_math_reward/mean": 0.6216517686843872, + "rewards/verify_math_reward/std": 0.4852459728717804, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0022311388893285766, + "clip_ratio/high_mean": 0.000997254599496955, + "clip_ratio/low_mean": 0.0006108485558797838, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016081031499197707, + "epoch": 0.4992709244677749, + "grad_norm": 0.1517784148454666, + "learning_rate": 1e-06, + "loss": -0.0272, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0021470280553330667, + "clip_ratio/high_mean": 0.0008577115968364524, + "clip_ratio/low_mean": 0.0007024550213827752, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001560166598210344, + "epoch": 0.5016039661708953, + "grad_norm": 0.13804104924201965, + "learning_rate": 1e-06, + "loss": -0.0273, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0028051892440998927, + "clip_ratio/high_mean": 0.0010142452265426982, + "clip_ratio/low_mean": 0.0007994899351615459, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018137351871700957, + "epoch": 0.5039370078740157, + "grad_norm": 0.13298538327217102, + "learning_rate": 1e-06, + "loss": -0.0274, + "step": 216 + }, + { + "clip_ratio/high_max": 0.001884418226836715, + "clip_ratio/high_mean": 0.0008349796298716683, + "clip_ratio/low_mean": 0.0004762308617500821, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013112104643369094, + "completions/clipped_ratio": 0.0401785714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2688.0, + "completions/mean_length": 732.818115234375, + "completions/mean_terminated_length": 592.03369140625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.5062700495771362, + "grad_norm": 0.15719586610794067, + "learning_rate": 1e-06, + "loss": -0.0224, + "num_tokens": 33466901.0, + "reward": 0.609375, + "reward_std": 0.18490804731845856, + "rewards/verify_math_reward/mean": 0.609375, + "rewards/verify_math_reward/std": 0.48816296458244324, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0024248765403172, + "clip_ratio/high_mean": 0.0009681946321506985, + "clip_ratio/low_mean": 0.0005426817997431499, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001510876463726163, + "epoch": 0.5086030912802566, + "grad_norm": 0.17562437057495117, + "learning_rate": 1e-06, + "loss": -0.0226, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0024607518353150226, + "clip_ratio/high_mean": 0.0010107837515533902, + "clip_ratio/low_mean": 0.000817063302747556, + "clip_ratio/low_min": 1.133478417614242e-05, + "clip_ratio/region_mean": 0.0018278470524819568, + "epoch": 0.510936132983377, + "grad_norm": 0.13347984850406647, + "learning_rate": 1e-06, + "loss": -0.0227, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0026186130417045206, + "clip_ratio/high_mean": 0.0010899415683525149, + "clip_ratio/low_mean": 0.0009008191700559109, + "clip_ratio/low_min": 2.7310465156915598e-05, + "clip_ratio/region_mean": 0.0019907606911147013, + "epoch": 0.5132691746864976, + "grad_norm": 0.13431869447231293, + "learning_rate": 1e-06, + "loss": -0.0228, + "step": 220 + }, + { + "clip_ratio/high_max": 0.001676053161645541, + "clip_ratio/high_mean": 0.0005422348167485325, + "clip_ratio/low_mean": 0.0005290484946272045, + "clip_ratio/low_min": 2.7246001081948634e-05, + "clip_ratio/region_mean": 0.0010712833209254313, + "completions/clipped_ratio": 0.0479910714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2962.0, + "completions/mean_length": 773.9029541015625, + "completions/mean_terminated_length": 606.4349365234375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.515602216389618, + "grad_norm": 0.1542210876941681, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 34075598.0, + "reward": 0.515625, + "reward_std": 0.16360442340373993, + "rewards/verify_math_reward/mean": 0.515625, + "rewards/verify_math_reward/std": 0.5000349283218384, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0019124368445773143, + "clip_ratio/high_mean": 0.0006477261977124726, + "clip_ratio/low_mean": 0.000581258345846436, + "clip_ratio/low_min": 3.816493881458882e-05, + "clip_ratio/region_mean": 0.0012289845508348662, + "epoch": 0.5179352580927384, + "grad_norm": 0.14712415635585785, + "learning_rate": 1e-06, + "loss": -0.0054, + "step": 222 + }, + { + "clip_ratio/high_max": 0.002208602600148879, + "clip_ratio/high_mean": 0.000756114433897892, + "clip_ratio/low_mean": 0.0008703372177478741, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016264516452793032, + "epoch": 0.5202682997958589, + "grad_norm": 0.13447299599647522, + "learning_rate": 1e-06, + "loss": -0.0056, + "step": 223 + }, + { + "clip_ratio/high_max": 0.002081741469737608, + "clip_ratio/high_mean": 0.0007299962762772338, + "clip_ratio/low_mean": 0.0009493385659880005, + "clip_ratio/low_min": 4.9603173465584405e-05, + "clip_ratio/region_mean": 0.0016793348258943297, + "epoch": 0.5226013414989793, + "grad_norm": 0.13474909961223602, + "learning_rate": 1e-06, + "loss": -0.0056, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0017302923006354831, + "clip_ratio/high_mean": 0.0007095045375535847, + "clip_ratio/low_mean": 0.0005966503613308305, + "clip_ratio/low_min": 3.647505218395963e-05, + "clip_ratio/region_mean": 0.0013061549034318887, + "completions/clipped_ratio": 0.049107142857142905, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2674.0, + "completions/mean_length": 749.3158569335938, + "completions/mean_terminated_length": 576.482421875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.5249343832020997, + "grad_norm": 0.16695386171340942, + "learning_rate": 1e-06, + "loss": -0.0156, + "num_tokens": 34653513.0, + "reward": 0.5892857313156128, + "reward_std": 0.17799869179725647, + "rewards/verify_math_reward/mean": 0.5892857313156128, + "rewards/verify_math_reward/std": 0.49223825335502625, + "step": 225 + }, + { + "clip_ratio/high_max": 0.002407783606031444, + "clip_ratio/high_mean": 0.0008509271483490011, + "clip_ratio/low_mean": 0.0007258261230163043, + "clip_ratio/low_min": 1.885938399937004e-05, + "clip_ratio/region_mean": 0.0015767532750032842, + "epoch": 0.5272674249052202, + "grad_norm": 0.1526087522506714, + "learning_rate": 1e-06, + "loss": -0.0158, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0023025913833407685, + "clip_ratio/high_mean": 0.00088137284910772, + "clip_ratio/low_mean": 0.000892662950718659, + "clip_ratio/low_min": 4.7148459998425096e-05, + "clip_ratio/region_mean": 0.0017740357870934531, + "epoch": 0.5296004666083406, + "grad_norm": 0.14756852388381958, + "learning_rate": 1e-06, + "loss": -0.016, + "step": 227 + }, + { + "clip_ratio/high_max": 0.002523296330764424, + "clip_ratio/high_mean": 0.000958762337177177, + "clip_ratio/low_mean": 0.000998503339360468, + "clip_ratio/low_min": 9.42969199968502e-06, + "clip_ratio/region_mean": 0.001957265689270571, + "epoch": 0.531933508311461, + "grad_norm": 0.14524367451667786, + "learning_rate": 1e-06, + "loss": -0.0161, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0017749884609656874, + "clip_ratio/high_mean": 0.0006844742229077383, + "clip_ratio/low_mean": 0.0005758627803515992, + "clip_ratio/low_min": 2.6606637220538687e-05, + "clip_ratio/region_mean": 0.0012603369905264117, + "completions/clipped_ratio": 0.044642857142857095, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3527.0, + "completions/mean_length": 783.2924194335938, + "completions/mean_terminated_length": 628.4929809570312, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.5342665500145816, + "grad_norm": 0.15385381877422333, + "learning_rate": 1e-06, + "loss": -0.02, + "num_tokens": 35281367.0, + "reward": 0.5524553656578064, + "reward_std": 0.1795709878206253, + "rewards/verify_math_reward/mean": 0.5524553656578064, + "rewards/verify_math_reward/std": 0.49751853942871094, + "step": 229 + }, + { + "clip_ratio/high_max": 0.002248025863082148, + "clip_ratio/high_mean": 0.0007510431696573505, + "clip_ratio/low_mean": 0.0006982658669585362, + "clip_ratio/low_min": 4.0437347706756555e-05, + "clip_ratio/region_mean": 0.0014493090311589185, + "epoch": 0.536599591717702, + "grad_norm": 0.14346085488796234, + "learning_rate": 1e-06, + "loss": -0.0203, + "step": 230 + }, + { + "clip_ratio/high_max": 0.002111926947691245, + "clip_ratio/high_mean": 0.0009057822080649203, + "clip_ratio/low_mean": 0.0007865216102800332, + "clip_ratio/low_min": 3.9903771721583325e-05, + "clip_ratio/region_mean": 0.001692303834715858, + "epoch": 0.5389326334208224, + "grad_norm": 0.13601410388946533, + "learning_rate": 1e-06, + "loss": -0.0203, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0023360310660791583, + "clip_ratio/high_mean": 0.0008537830599379959, + "clip_ratio/low_mean": 0.0009508076145721134, + "clip_ratio/low_min": 5.0289281716686673e-05, + "clip_ratio/region_mean": 0.0018045906108454801, + "epoch": 0.5412656751239429, + "grad_norm": 0.13489511609077454, + "learning_rate": 1e-06, + "loss": -0.0204, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0014880389171594288, + "clip_ratio/high_mean": 0.00048186242929659784, + "clip_ratio/low_mean": 0.00047997832007240504, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009618407475500135, + "completions/clipped_ratio": 0.0457589285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3521.0, + "completions/mean_length": 743.7467041015625, + "completions/mean_terminated_length": 582.9953002929688, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.5435987168270633, + "grad_norm": 0.13865098357200623, + "learning_rate": 1e-06, + "loss": -0.0151, + "num_tokens": 35873972.0, + "reward": 0.6071428656578064, + "reward_std": 0.11953012645244598, + "rewards/verify_math_reward/mean": 0.6071428656578064, + "rewards/verify_math_reward/std": 0.48865827918052673, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0019240232759329956, + "clip_ratio/high_mean": 0.0006175566004458233, + "clip_ratio/low_mean": 0.0005685450960299931, + "clip_ratio/low_min": 4.293565507396124e-05, + "clip_ratio/region_mean": 0.0011861016937473323, + "epoch": 0.5459317585301837, + "grad_norm": 0.13254310190677643, + "learning_rate": 1e-06, + "loss": -0.0152, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0022041650518076494, + "clip_ratio/high_mean": 0.0006964148251427105, + "clip_ratio/low_mean": 0.0007112952880561352, + "clip_ratio/low_min": 5.803156818728894e-05, + "clip_ratio/region_mean": 0.0014077101077418774, + "epoch": 0.5482648002333042, + "grad_norm": 0.12212701886892319, + "learning_rate": 1e-06, + "loss": -0.0154, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0022669806421617977, + "clip_ratio/high_mean": 0.0007154976665333379, + "clip_ratio/low_mean": 0.0007446610470651649, + "clip_ratio/low_min": 4.724111931864172e-05, + "clip_ratio/region_mean": 0.0014601587245124392, + "epoch": 0.5505978419364246, + "grad_norm": 0.1242891252040863, + "learning_rate": 1e-06, + "loss": -0.0154, + "step": 236 + }, + { + "clip_ratio/high_max": 0.001739940886182012, + "clip_ratio/high_mean": 0.0006690709469694411, + "clip_ratio/low_mean": 0.0006471121196227614, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013161830611352343, + "completions/clipped_ratio": 0.0379464285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3510.0, + "completions/mean_length": 718.7522583007812, + "completions/mean_terminated_length": 585.5429077148438, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.552930883639545, + "grad_norm": 0.19878889620304108, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 36463718.0, + "reward": 0.6283482313156128, + "reward_std": 0.16938656568527222, + "rewards/verify_math_reward/mean": 0.6283482313156128, + "rewards/verify_math_reward/std": 0.4835159480571747, + "step": 237 + }, + { + "clip_ratio/high_max": 0.002143624980817549, + "clip_ratio/high_mean": 0.0008554658179491526, + "clip_ratio/low_mean": 0.0008386970785068115, + "clip_ratio/low_min": 5.576920466410229e-05, + "clip_ratio/region_mean": 0.00169416287826607, + "epoch": 0.5552639253426656, + "grad_norm": 0.15399928390979767, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0023502816547988914, + "clip_ratio/high_mean": 0.0009067850733117666, + "clip_ratio/low_mean": 0.0009722007471282268, + "clip_ratio/low_min": 5.104478077555541e-05, + "clip_ratio/region_mean": 0.0018789857713272795, + "epoch": 0.557596967045786, + "grad_norm": 0.15232457220554352, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0023724731108814012, + "clip_ratio/high_mean": 0.0009244451975973789, + "clip_ratio/low_mean": 0.0010867376477108337, + "clip_ratio/low_min": 4.5410639359033667e-05, + "clip_ratio/region_mean": 0.0020111828271183185, + "epoch": 0.5599300087489064, + "grad_norm": 0.14790889620780945, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0016735052049625665, + "clip_ratio/high_mean": 0.0006779307495889952, + "clip_ratio/low_mean": 0.0006132294238341274, + "clip_ratio/low_min": 2.033725832006894e-05, + "clip_ratio/region_mean": 0.0012911601916130167, + "completions/clipped_ratio": 0.0691964285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2889.0, + "completions/mean_length": 889.2645263671875, + "completions/mean_terminated_length": 650.8741455078125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.5622630504520268, + "grad_norm": 0.16441886126995087, + "learning_rate": 1e-06, + "loss": -0.0141, + "num_tokens": 37090571.0, + "reward": 0.5691964626312256, + "reward_std": 0.18403972685337067, + "rewards/verify_math_reward/mean": 0.5691964030265808, + "rewards/verify_math_reward/std": 0.4954652488231659, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0018577990049379878, + "clip_ratio/high_mean": 0.0007666599976801081, + "clip_ratio/low_mean": 0.0006816638942837017, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001448323866497958, + "epoch": 0.5645960921551473, + "grad_norm": 0.14850129187107086, + "learning_rate": 1e-06, + "loss": -0.0142, + "step": 242 + }, + { + "clip_ratio/high_max": 0.002126586354279425, + "clip_ratio/high_mean": 0.0009070793985301862, + "clip_ratio/low_mean": 0.0008444239938398823, + "clip_ratio/low_min": 2.9691153940802906e-05, + "clip_ratio/region_mean": 0.0017515034051029943, + "epoch": 0.5669291338582677, + "grad_norm": 0.13421630859375, + "learning_rate": 1e-06, + "loss": -0.0144, + "step": 243 + }, + { + "clip_ratio/high_max": 0.002174347777327057, + "clip_ratio/high_mean": 0.0008673456795804668, + "clip_ratio/low_mean": 0.0009673706917965319, + "clip_ratio/low_min": 7.436059331666911e-05, + "clip_ratio/region_mean": 0.001834716422308702, + "epoch": 0.5692621755613881, + "grad_norm": 0.132176011800766, + "learning_rate": 1e-06, + "loss": -0.0145, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0028683393175015226, + "clip_ratio/high_mean": 0.0010127493369509466, + "clip_ratio/low_mean": 0.00075934248707199, + "clip_ratio/low_min": 1.3736264008912258e-05, + "clip_ratio/region_mean": 0.0017720918040140532, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3895.0, + "completions/mean_length": 881.755615234375, + "completions/mean_terminated_length": 667.47265625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.5715952172645086, + "grad_norm": 0.18064840137958527, + "learning_rate": 1e-06, + "loss": -0.0202, + "num_tokens": 37735216.0, + "reward": 0.5412946939468384, + "reward_std": 0.22469766438007355, + "rewards/verify_math_reward/mean": 0.5412946343421936, + "rewards/verify_math_reward/std": 0.49857014417648315, + "step": 245 + }, + { + "clip_ratio/high_max": 0.002810091638821177, + "clip_ratio/high_mean": 0.0011130381317343563, + "clip_ratio/low_mean": 0.0009102133517444599, + "clip_ratio/low_min": 4.440503107616678e-05, + "clip_ratio/region_mean": 0.0020232514580129646, + "epoch": 0.573928258967629, + "grad_norm": 0.1686396300792694, + "learning_rate": 1e-06, + "loss": -0.0204, + "step": 246 + }, + { + "clip_ratio/high_max": 0.003302433338831179, + "clip_ratio/high_mean": 0.0012588584941113368, + "clip_ratio/low_mean": 0.0010801408207044005, + "clip_ratio/low_min": 5.4901300245546736e-05, + "clip_ratio/region_mean": 0.0023389992784359492, + "epoch": 0.5762613006707495, + "grad_norm": 0.16458149254322052, + "learning_rate": 1e-06, + "loss": -0.0206, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0036235110528650694, + "clip_ratio/high_mean": 0.0013374627342273016, + "clip_ratio/low_mean": 0.0011865439810208045, + "clip_ratio/low_min": 0.00012475728908611927, + "clip_ratio/region_mean": 0.002524006675230339, + "epoch": 0.57859434237387, + "grad_norm": 0.15335099399089813, + "learning_rate": 1e-06, + "loss": -0.0207, + "step": 248 + }, + { + "clip_ratio/high_max": 0.002065000957372831, + "clip_ratio/high_mean": 0.0008984474952740129, + "clip_ratio/low_mean": 0.00064315901727241, + "clip_ratio/low_min": 1.1287700544926338e-05, + "clip_ratio/region_mean": 0.0015416065143654123, + "completions/clipped_ratio": 0.0580357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3344.0, + "completions/mean_length": 811.1272583007812, + "completions/mean_terminated_length": 608.74169921875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.5809273840769904, + "grad_norm": 0.1811312586069107, + "learning_rate": 1e-06, + "loss": -0.0335, + "num_tokens": 38327650.0, + "reward": 0.6328125, + "reward_std": 0.19808951020240784, + "rewards/verify_math_reward/mean": 0.6328125, + "rewards/verify_math_reward/std": 0.48230743408203125, + "step": 249 + }, + { + "clip_ratio/high_max": 0.002202488452894613, + "clip_ratio/high_mean": 0.0009702885690785479, + "clip_ratio/low_mean": 0.0008369570678041782, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018072456441586837, + "epoch": 0.5832604257801108, + "grad_norm": 0.17485851049423218, + "learning_rate": 1e-06, + "loss": -0.0337, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0023402113292831928, + "clip_ratio/high_mean": 0.0010030769644799875, + "clip_ratio/low_mean": 0.0009955549285223242, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019986318657174706, + "epoch": 0.5855934674832313, + "grad_norm": 0.15953102707862854, + "learning_rate": 1e-06, + "loss": -0.0338, + "step": 251 + }, + { + "clip_ratio/high_max": 0.002859046173398383, + "clip_ratio/high_mean": 0.0012013425111945253, + "clip_ratio/low_mean": 0.001164675115433056, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023660176739213057, + "epoch": 0.5879265091863517, + "grad_norm": 0.15245676040649414, + "learning_rate": 1e-06, + "loss": -0.034, + "step": 252 + }, + { + "clip_ratio/high_max": 0.002454886576742865, + "clip_ratio/high_mean": 0.0011808079943875782, + "clip_ratio/low_mean": 0.0007260778120325995, + "clip_ratio/low_min": 3.780636416195193e-05, + "clip_ratio/region_mean": 0.001906885787320789, + "completions/clipped_ratio": 0.0613839285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3406.0, + "completions/mean_length": 845.630615234375, + "completions/mean_terminated_length": 633.0618286132812, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.5902595508894721, + "grad_norm": 0.19520410895347595, + "learning_rate": 1e-06, + "loss": -0.0342, + "num_tokens": 38949823.0, + "reward": 0.6316964626312256, + "reward_std": 0.23886480927467346, + "rewards/verify_math_reward/mean": 0.6316964030265808, + "rewards/verify_math_reward/std": 0.4826137125492096, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0029217097981018014, + "clip_ratio/high_mean": 0.0012666904185607564, + "clip_ratio/low_mean": 0.0009204373054672033, + "clip_ratio/low_min": 7.578930308227427e-05, + "clip_ratio/region_mean": 0.002187127734941896, + "epoch": 0.5925925925925926, + "grad_norm": 0.19551533460617065, + "learning_rate": 1e-06, + "loss": -0.0344, + "step": 254 + }, + { + "clip_ratio/high_max": 0.003028957042261027, + "clip_ratio/high_mean": 0.0013827830480295233, + "clip_ratio/low_mean": 0.0011081993379775668, + "clip_ratio/low_min": 5.027349016017979e-05, + "clip_ratio/region_mean": 0.0024909823696361855, + "epoch": 0.594925634295713, + "grad_norm": 0.1766534298658371, + "learning_rate": 1e-06, + "loss": -0.0346, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0029570692277047783, + "clip_ratio/high_mean": 0.001343818676105002, + "clip_ratio/low_mean": 0.0012197565392852994, + "clip_ratio/low_min": 5.0371709221508354e-05, + "clip_ratio/region_mean": 0.002563575231761206, + "epoch": 0.5972586759988335, + "grad_norm": 0.16498038172721863, + "learning_rate": 1e-06, + "loss": -0.0347, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0020004543803224806, + "clip_ratio/high_mean": 0.0007045400034257909, + "clip_ratio/low_mean": 0.0006450886503444053, + "clip_ratio/low_min": 3.832004949799739e-05, + "clip_ratio/region_mean": 0.0013496286410372704, + "completions/clipped_ratio": 0.052455357142857095, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4033.0, + "completions/mean_length": 819.6406860351562, + "completions/mean_terminated_length": 638.2638549804688, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.599591717701954, + "grad_norm": 0.174686998128891, + "learning_rate": 1e-06, + "loss": -0.0272, + "num_tokens": 39589389.0, + "reward": 0.5703125, + "reward_std": 0.18776056170463562, + "rewards/verify_math_reward/mean": 0.5703125, + "rewards/verify_math_reward/std": 0.49530795216560364, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0021911398507654667, + "clip_ratio/high_mean": 0.0008066736936598318, + "clip_ratio/low_mean": 0.000846038323288667, + "clip_ratio/low_min": 2.730450069066137e-05, + "clip_ratio/region_mean": 0.0016527120096725412, + "epoch": 0.6019247594050744, + "grad_norm": 0.15377922356128693, + "learning_rate": 1e-06, + "loss": -0.0274, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0026595802191877738, + "clip_ratio/high_mean": 0.0009453495586058125, + "clip_ratio/low_mean": 0.0009764476362761343, + "clip_ratio/low_min": 1.9160024748998694e-05, + "clip_ratio/region_mean": 0.0019217972112528514, + "epoch": 0.6042578011081948, + "grad_norm": 0.14722417294979095, + "learning_rate": 1e-06, + "loss": -0.0276, + "step": 259 + }, + { + "clip_ratio/high_max": 0.002826516800269019, + "clip_ratio/high_mean": 0.0009853889441728825, + "clip_ratio/low_mean": 0.0011208343730686465, + "clip_ratio/low_min": 2.603443499538116e-05, + "clip_ratio/region_mean": 0.002106223335431423, + "epoch": 0.6065908428113153, + "grad_norm": 0.14746461808681488, + "learning_rate": 1e-06, + "loss": -0.0277, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0017576406498847064, + "clip_ratio/high_mean": 0.0005976478887532721, + "clip_ratio/low_mean": 0.0004928772068524268, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010905250819632784, + "completions/clipped_ratio": 0.0558035714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3622.0, + "completions/mean_length": 769.7455444335938, + "completions/mean_terminated_length": 573.1583862304688, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.6089238845144357, + "grad_norm": 0.16936379671096802, + "learning_rate": 1e-06, + "loss": -0.0272, + "num_tokens": 40172817.0, + "reward": 0.598214328289032, + "reward_std": 0.1515413373708725, + "rewards/verify_math_reward/mean": 0.5982142686843872, + "rewards/verify_math_reward/std": 0.49053290486335754, + "step": 261 + }, + { + "clip_ratio/high_max": 0.002137923140253406, + "clip_ratio/high_mean": 0.0007493480406992603, + "clip_ratio/low_mean": 0.000567885364034737, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013172333929105662, + "epoch": 0.6112569262175561, + "grad_norm": 0.15487205982208252, + "learning_rate": 1e-06, + "loss": -0.0274, + "step": 262 + }, + { + "clip_ratio/high_max": 0.002313453282113187, + "clip_ratio/high_mean": 0.0008186651230062125, + "clip_ratio/low_mean": 0.0007745958864688873, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001593260996742174, + "epoch": 0.6135899679206765, + "grad_norm": 0.13710205256938934, + "learning_rate": 1e-06, + "loss": -0.0276, + "step": 263 + }, + { + "clip_ratio/high_max": 0.002353799540287582, + "clip_ratio/high_mean": 0.0008824724536680151, + "clip_ratio/low_mean": 0.0008264293965112302, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017089018401748035, + "epoch": 0.615923009623797, + "grad_norm": 0.13875557482242584, + "learning_rate": 1e-06, + "loss": -0.0276, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0013810562340950128, + "clip_ratio/high_mean": 0.0005493980743267457, + "clip_ratio/low_mean": 0.0005788339622085914, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011282320301688742, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3763.0, + "completions/mean_length": 908.1875610351562, + "completions/mean_terminated_length": 695.6666870117188, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.6182560513269175, + "grad_norm": 0.14903157949447632, + "learning_rate": 1e-06, + "loss": -0.0152, + "num_tokens": 40852849.0, + "reward": 0.535714328289032, + "reward_std": 0.17731469869613647, + "rewards/verify_math_reward/mean": 0.5357142686843872, + "rewards/verify_math_reward/std": 0.4990014135837555, + "step": 265 + }, + { + "clip_ratio/high_max": 0.001720084110274911, + "clip_ratio/high_mean": 0.0006810845243307995, + "clip_ratio/low_mean": 0.000695541473760386, + "clip_ratio/low_min": 6.149285763967782e-05, + "clip_ratio/region_mean": 0.0013766260017291643, + "epoch": 0.620589093030038, + "grad_norm": 0.14406567811965942, + "learning_rate": 1e-06, + "loss": -0.0154, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0020672293176176026, + "clip_ratio/high_mean": 0.0007439389773935545, + "clip_ratio/low_mean": 0.0008149237601173809, + "clip_ratio/low_min": 6.404508076229831e-05, + "clip_ratio/region_mean": 0.00155886275388184, + "epoch": 0.6229221347331584, + "grad_norm": 0.14669474959373474, + "learning_rate": 1e-06, + "loss": -0.0155, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0019061784987570718, + "clip_ratio/high_mean": 0.0007410540856653824, + "clip_ratio/low_mean": 0.0009376859288749984, + "clip_ratio/low_min": 6.939301329111913e-05, + "clip_ratio/region_mean": 0.001678740001807455, + "epoch": 0.6252551764362788, + "grad_norm": 0.13039635121822357, + "learning_rate": 1e-06, + "loss": -0.0156, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0018028199483524077, + "clip_ratio/high_mean": 0.0006874569044157397, + "clip_ratio/low_mean": 0.0005106005064590136, + "clip_ratio/low_min": 2.9585884476546198e-05, + "clip_ratio/region_mean": 0.0011980574017798062, + "completions/clipped_ratio": 0.0770089285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3214.0, + "completions/mean_length": 888.19091796875, + "completions/mean_terminated_length": 620.5502319335938, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.6275882181393992, + "grad_norm": 0.15618275105953217, + "learning_rate": 1e-06, + "loss": -0.0256, + "num_tokens": 41449668.0, + "reward": 0.5658482313156128, + "reward_std": 0.1510535031557083, + "rewards/verify_math_reward/mean": 0.5658482313156128, + "rewards/verify_math_reward/std": 0.49592188000679016, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0019446795340627432, + "clip_ratio/high_mean": 0.0007613918933202513, + "clip_ratio/low_mean": 0.0007282111346285092, + "clip_ratio/low_min": 3.0350084671226796e-05, + "clip_ratio/region_mean": 0.001489602989749983, + "epoch": 0.6299212598425197, + "grad_norm": 0.14463922381401062, + "learning_rate": 1e-06, + "loss": -0.0258, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0021781206960440613, + "clip_ratio/high_mean": 0.0008448699973087059, + "clip_ratio/low_mean": 0.0008981303326436318, + "clip_ratio/low_min": 8.13642363937106e-05, + "clip_ratio/region_mean": 0.0017430003572371788, + "epoch": 0.6322543015456401, + "grad_norm": 0.1415518820285797, + "learning_rate": 1e-06, + "loss": -0.026, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0020730302603624295, + "clip_ratio/high_mean": 0.0008484179506922374, + "clip_ratio/low_mean": 0.000955949011768098, + "clip_ratio/low_min": 0.00013005232176510617, + "clip_ratio/region_mean": 0.0018043669479084201, + "epoch": 0.6345873432487605, + "grad_norm": 0.13381057977676392, + "learning_rate": 1e-06, + "loss": -0.026, + "step": 272 + }, + { + "clip_ratio/high_max": 0.002148325598682277, + "clip_ratio/high_mean": 0.0009207470575347543, + "clip_ratio/low_mean": 0.0006785067789678578, + "clip_ratio/low_min": 1.9409937522141263e-05, + "clip_ratio/region_mean": 0.0015992538355931174, + "completions/clipped_ratio": 0.0837053571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3847.0, + "completions/mean_length": 948.7210083007812, + "completions/mean_terminated_length": 661.2107543945312, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.636920384951881, + "grad_norm": 0.20500075817108154, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 42082890.0, + "reward": 0.5368303656578064, + "reward_std": 0.2144351601600647, + "rewards/verify_math_reward/mean": 0.5368303656578064, + "rewards/verify_math_reward/std": 0.49892017245292664, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0024067259291769005, + "clip_ratio/high_mean": 0.0010504097954253666, + "clip_ratio/low_mean": 0.0008104904227366205, + "clip_ratio/low_min": 1.625487675482873e-05, + "clip_ratio/region_mean": 0.0018609002800076269, + "epoch": 0.6392534266550015, + "grad_norm": 0.18786416947841644, + "learning_rate": 1e-06, + "loss": -0.0058, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0024999762536026537, + "clip_ratio/high_mean": 0.0010430742440803442, + "clip_ratio/low_mean": 0.0010387395932411891, + "clip_ratio/low_min": 8.285888179671019e-05, + "clip_ratio/region_mean": 0.0020818138364120387, + "epoch": 0.6415864683581219, + "grad_norm": 0.17506757378578186, + "learning_rate": 1e-06, + "loss": -0.006, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0027001566559192725, + "clip_ratio/high_mean": 0.001160879499366274, + "clip_ratio/low_mean": 0.0014031115624675294, + "clip_ratio/low_min": 0.0001541474302939605, + "clip_ratio/region_mean": 0.0025639910527388565, + "epoch": 0.6439195100612424, + "grad_norm": 0.19004221260547638, + "learning_rate": 1e-06, + "loss": -0.0061, + "step": 276 + }, + { + "clip_ratio/high_max": 0.001982006178877782, + "clip_ratio/high_mean": 0.0008011192785488674, + "clip_ratio/low_mean": 0.0008978966488939477, + "clip_ratio/low_min": 0.0001257303429156309, + "clip_ratio/region_mean": 0.0016990159565466456, + "completions/clipped_ratio": 0.0736607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4062.0, + "completions/mean_length": 922.2232666015625, + "completions/mean_terminated_length": 669.8505859375, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.6462525517643628, + "grad_norm": 0.2027897983789444, + "learning_rate": 1e-06, + "loss": -0.0333, + "num_tokens": 42728906.0, + "reward": 0.5569196939468384, + "reward_std": 0.22837404906749725, + "rewards/verify_math_reward/mean": 0.5569196343421936, + "rewards/verify_math_reward/std": 0.49702703952789307, + "step": 277 + }, + { + "clip_ratio/high_max": 0.002469796600053087, + "clip_ratio/high_mean": 0.0010389416602265555, + "clip_ratio/low_mean": 0.001157696471636882, + "clip_ratio/low_min": 0.0002515920023142826, + "clip_ratio/region_mean": 0.00219663812458748, + "epoch": 0.6485855934674832, + "grad_norm": 0.18985936045646667, + "learning_rate": 1e-06, + "loss": -0.0335, + "step": 278 + }, + { + "clip_ratio/high_max": 0.002642264138557948, + "clip_ratio/high_mean": 0.0011488028867461253, + "clip_ratio/low_mean": 0.0012836683745263144, + "clip_ratio/low_min": 0.00027828193560708314, + "clip_ratio/region_mean": 0.0024324712867382914, + "epoch": 0.6509186351706037, + "grad_norm": 0.18310676515102386, + "learning_rate": 1e-06, + "loss": -0.0337, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0028919992182636634, + "clip_ratio/high_mean": 0.0011562096296984237, + "clip_ratio/low_mean": 0.0015426580976054538, + "clip_ratio/low_min": 0.0002890102396122529, + "clip_ratio/region_mean": 0.002698867676372174, + "epoch": 0.6532516768737241, + "grad_norm": 0.16260650753974915, + "learning_rate": 1e-06, + "loss": -0.0338, + "step": 280 + }, + { + "clip_ratio/high_max": 0.00219465876580216, + "clip_ratio/high_mean": 0.0007919146764834295, + "clip_ratio/low_mean": 0.0006715007675666129, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014634154649684206, + "completions/clipped_ratio": 0.0680803571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3350.0, + "completions/mean_length": 842.8627319335938, + "completions/mean_terminated_length": 605.2083740234375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.6555847185768445, + "grad_norm": 0.19941651821136475, + "learning_rate": 1e-06, + "loss": -0.0228, + "num_tokens": 43333839.0, + "reward": 0.5569196939468384, + "reward_std": 0.17528702318668365, + "rewards/verify_math_reward/mean": 0.5569196343421936, + "rewards/verify_math_reward/std": 0.4970270097255707, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0021507229139388073, + "clip_ratio/high_mean": 0.0008451492731182952, + "clip_ratio/low_mean": 0.0008469266977044754, + "clip_ratio/low_min": 1.4895138519932516e-05, + "clip_ratio/region_mean": 0.001692075948085403, + "epoch": 0.657917760279965, + "grad_norm": 0.16673003137111664, + "learning_rate": 1e-06, + "loss": -0.0229, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0024777984945103526, + "clip_ratio/high_mean": 0.0009478648280492052, + "clip_ratio/low_mean": 0.001053386182320537, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002001250999455806, + "epoch": 0.6602508019830855, + "grad_norm": 0.15395739674568176, + "learning_rate": 1e-06, + "loss": -0.0232, + "step": 283 + }, + { + "clip_ratio/high_max": 0.00210989813786, + "clip_ratio/high_mean": 0.0008852029659465188, + "clip_ratio/low_mean": 0.001249679880857002, + "clip_ratio/low_min": 1.4384349924512208e-05, + "clip_ratio/region_mean": 0.0021348828231566586, + "epoch": 0.6625838436862059, + "grad_norm": 0.14514604210853577, + "learning_rate": 1e-06, + "loss": -0.0232, + "step": 284 + }, + { + "clip_ratio/high_max": 0.001999265434278641, + "clip_ratio/high_mean": 0.000810526340501383, + "clip_ratio/low_mean": 0.0009376847829116741, + "clip_ratio/low_min": 6.594186834263382e-05, + "clip_ratio/region_mean": 0.0017482111034041736, + "completions/clipped_ratio": 0.0691964285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2510.0, + "completions/mean_length": 875.3549194335938, + "completions/mean_terminated_length": 635.9304809570312, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.6649168853893264, + "grad_norm": 0.22797183692455292, + "learning_rate": 1e-06, + "loss": -0.0146, + "num_tokens": 43961645.0, + "reward": 0.5212053656578064, + "reward_std": 0.2005843222141266, + "rewards/verify_math_reward/mean": 0.5212053656578064, + "rewards/verify_math_reward/std": 0.49982911348342896, + "step": 285 + }, + { + "clip_ratio/high_max": 0.002601971646072343, + "clip_ratio/high_mean": 0.000978758151177317, + "clip_ratio/low_mean": 0.0011493046149553265, + "clip_ratio/low_min": 0.00012364785106910858, + "clip_ratio/region_mean": 0.002128062733390834, + "epoch": 0.6672499270924468, + "grad_norm": 0.19644752144813538, + "learning_rate": 1e-06, + "loss": -0.0149, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0029370293123065494, + "clip_ratio/high_mean": 0.001147280399891315, + "clip_ratio/low_mean": 0.0013407869519141968, + "clip_ratio/low_min": 0.00013045562263869215, + "clip_ratio/region_mean": 0.0024880673008738086, + "epoch": 0.6695829687955672, + "grad_norm": 0.17700296640396118, + "learning_rate": 1e-06, + "loss": -0.0151, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0031012436520541087, + "clip_ratio/high_mean": 0.0011367060978955124, + "clip_ratio/low_mean": 0.0016410894750151783, + "clip_ratio/low_min": 0.00017486737488070503, + "clip_ratio/region_mean": 0.002777795525616966, + "epoch": 0.6719160104986877, + "grad_norm": 0.17286638915538788, + "learning_rate": 1e-06, + "loss": -0.0152, + "step": 288 + }, + { + "clip_ratio/high_max": 0.001724022728012642, + "clip_ratio/high_mean": 0.0006826344042565324, + "clip_ratio/low_mean": 0.0005667309815180488, + "clip_ratio/low_min": 2.887398932216456e-05, + "clip_ratio/region_mean": 0.0012493653921410441, + "completions/clipped_ratio": 0.0770089285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3297.0, + "completions/mean_length": 920.333740234375, + "completions/mean_terminated_length": 655.3748779296875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.6742490522018081, + "grad_norm": 0.17062070965766907, + "learning_rate": 1e-06, + "loss": -0.0542, + "num_tokens": 44600416.0, + "reward": 0.5580357313156128, + "reward_std": 0.16750861704349518, + "rewards/verify_math_reward/mean": 0.5580357313156128, + "rewards/verify_math_reward/std": 0.49689781665802, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0020607160186045803, + "clip_ratio/high_mean": 0.0008350994885404361, + "clip_ratio/low_mean": 0.0006770236104784999, + "clip_ratio/low_min": 1.2852148756792303e-05, + "clip_ratio/region_mean": 0.0015121231153898407, + "epoch": 0.6765820939049285, + "grad_norm": 0.154007688164711, + "learning_rate": 1e-06, + "loss": -0.0543, + "step": 290 + }, + { + "clip_ratio/high_max": 0.002040704872342758, + "clip_ratio/high_mean": 0.0009163299873762298, + "clip_ratio/low_mean": 0.0008831634622765705, + "clip_ratio/low_min": 3.047500467801001e-05, + "clip_ratio/region_mean": 0.0017994934823946096, + "epoch": 0.678915135608049, + "grad_norm": 0.1436816304922104, + "learning_rate": 1e-06, + "loss": -0.0544, + "step": 291 + }, + { + "clip_ratio/high_max": 0.002354840806219727, + "clip_ratio/high_mean": 0.0008853112758515636, + "clip_ratio/low_mean": 0.0009733925799082499, + "clip_ratio/low_min": 2.5704297513584606e-05, + "clip_ratio/region_mean": 0.0018587038503028452, + "epoch": 0.6812481773111695, + "grad_norm": 0.13852904736995697, + "learning_rate": 1e-06, + "loss": -0.0545, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0021498534333659336, + "clip_ratio/high_mean": 0.0008024014623515541, + "clip_ratio/low_mean": 0.0006454346130340127, + "clip_ratio/low_min": 2.5375558834639378e-05, + "clip_ratio/region_mean": 0.001447836053557694, + "completions/clipped_ratio": 0.0926339285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3623.0, + "completions/mean_length": 971.1328735351562, + "completions/mean_terminated_length": 652.1119384765625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.6835812190142899, + "grad_norm": 0.20537430047988892, + "learning_rate": 1e-06, + "loss": -0.0384, + "num_tokens": 45220287.0, + "reward": 0.5691964626312256, + "reward_std": 0.18303194642066956, + "rewards/verify_math_reward/mean": 0.5691964030265808, + "rewards/verify_math_reward/std": 0.4954652488231659, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0021460836433107033, + "clip_ratio/high_mean": 0.0008764559806877514, + "clip_ratio/low_mean": 0.0008536525929230265, + "clip_ratio/low_min": 8.626158432889497e-05, + "clip_ratio/region_mean": 0.0017301085536018945, + "epoch": 0.6859142607174104, + "grad_norm": 0.18146011233329773, + "learning_rate": 1e-06, + "loss": -0.0386, + "step": 294 + }, + { + "clip_ratio/high_max": 0.002821837035298813, + "clip_ratio/high_mean": 0.0010902132889896166, + "clip_ratio/low_mean": 0.0010684805311029777, + "clip_ratio/low_min": 5.1454062486300245e-05, + "clip_ratio/region_mean": 0.0021586938310065307, + "epoch": 0.6882473024205308, + "grad_norm": 0.17304526269435883, + "learning_rate": 1e-06, + "loss": -0.0388, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0025890369288390502, + "clip_ratio/high_mean": 0.0010107452362717595, + "clip_ratio/low_mean": 0.0011576586766750552, + "clip_ratio/low_min": 7.048963198030833e-05, + "clip_ratio/region_mean": 0.0021684039020328782, + "epoch": 0.6905803441236512, + "grad_norm": 0.15993866324424744, + "learning_rate": 1e-06, + "loss": -0.0389, + "step": 296 + }, + { + "clip_ratio/high_max": 0.002087494191073347, + "clip_ratio/high_mean": 0.0007458662621502299, + "clip_ratio/low_mean": 0.0006469466452472261, + "clip_ratio/low_min": 2.270076856802916e-05, + "clip_ratio/region_mean": 0.0013928129119449295, + "completions/clipped_ratio": 0.0636160714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3791.0, + "completions/mean_length": 866.2857666015625, + "completions/mean_terminated_length": 646.8652954101562, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.6929133858267716, + "grad_norm": 0.19377551972866058, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 45848071.0, + "reward": 0.5390625, + "reward_std": 0.16830208897590637, + "rewards/verify_math_reward/mean": 0.5390625, + "rewards/verify_math_reward/std": 0.4987502098083496, + "step": 297 + }, + { + "clip_ratio/high_max": 0.002341042520129122, + "clip_ratio/high_mean": 0.0008734871789783938, + "clip_ratio/low_mean": 0.0007677624307689257, + "clip_ratio/low_min": 3.7830393011972774e-05, + "clip_ratio/region_mean": 0.0016412495970143937, + "epoch": 0.6952464275298921, + "grad_norm": 0.18440525233745575, + "learning_rate": 1e-06, + "loss": -0.0056, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0025020054017659277, + "clip_ratio/high_mean": 0.000964397990173893, + "clip_ratio/low_mean": 0.0009638205920055043, + "clip_ratio/low_min": 1.9797276763711125e-05, + "clip_ratio/region_mean": 0.0019282185603515245, + "epoch": 0.6975794692330125, + "grad_norm": 0.1549079865217209, + "learning_rate": 1e-06, + "loss": -0.0058, + "step": 299 + }, + { + "clip_ratio/high_max": 0.002495607521268539, + "clip_ratio/high_mean": 0.000916457383937086, + "clip_ratio/low_mean": 0.0011305743719276506, + "clip_ratio/low_min": 3.956953059969237e-05, + "clip_ratio/region_mean": 0.002047031703114044, + "epoch": 0.6999125109361329, + "grad_norm": 0.15340441465377808, + "learning_rate": 1e-06, + "loss": -0.0059, + "step": 300 + }, + { + "clip_ratio/high_max": 0.002479487993696239, + "clip_ratio/high_mean": 0.0010109548456966877, + "clip_ratio/low_mean": 0.0007489804047509097, + "clip_ratio/low_min": 8.222711767302826e-05, + "clip_ratio/region_mean": 0.0017599352431716397, + "completions/clipped_ratio": 0.1004464285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2059.0, + "completions/mean_length": 1006.3047485351562, + "completions/mean_terminated_length": 661.301513671875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.7022455526392535, + "grad_norm": 0.21473151445388794, + "learning_rate": 1e-06, + "loss": -0.0478, + "num_tokens": 46474320.0, + "reward": 0.543526828289032, + "reward_std": 0.2144775092601776, + "rewards/verify_math_reward/mean": 0.5435267686843872, + "rewards/verify_math_reward/std": 0.49838000535964966, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0028480310429586098, + "clip_ratio/high_mean": 0.0011781237517425325, + "clip_ratio/low_mean": 0.0008835742810333613, + "clip_ratio/low_min": 9.115397187997587e-05, + "clip_ratio/region_mean": 0.002061698047327809, + "epoch": 0.7045785943423739, + "grad_norm": 0.19146059453487396, + "learning_rate": 1e-06, + "loss": -0.048, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0030311847804114223, + "clip_ratio/high_mean": 0.0012647803814616054, + "clip_ratio/low_mean": 0.0011280409526079893, + "clip_ratio/low_min": 6.366150228132028e-05, + "clip_ratio/region_mean": 0.0023928213558974676, + "epoch": 0.7069116360454943, + "grad_norm": 0.18539467453956604, + "learning_rate": 1e-06, + "loss": -0.0482, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0030102770251687616, + "clip_ratio/high_mean": 0.001217226883454714, + "clip_ratio/low_mean": 0.0013476571002684068, + "clip_ratio/low_min": 0.00012812625027436297, + "clip_ratio/region_mean": 0.002564883994637057, + "epoch": 0.7092446777486148, + "grad_norm": 0.18372808396816254, + "learning_rate": 1e-06, + "loss": -0.0483, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0018803218481480144, + "clip_ratio/high_mean": 0.00078264167495945, + "clip_ratio/low_mean": 0.0006800834380555898, + "clip_ratio/low_min": 1.233471448358614e-05, + "clip_ratio/region_mean": 0.0014627251330239233, + "completions/clipped_ratio": 0.0848214285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2283.0, + "completions/mean_length": 905.3359985351562, + "completions/mean_terminated_length": 609.6158447265625, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.7115777194517352, + "grad_norm": 0.2239924669265747, + "learning_rate": 1e-06, + "loss": -0.0251, + "num_tokens": 47065829.0, + "reward": 0.5714285969734192, + "reward_std": 0.1870116889476776, + "rewards/verify_math_reward/mean": 0.5714285969734192, + "rewards/verify_math_reward/std": 0.49514803290367126, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0026517381411395036, + "clip_ratio/high_mean": 0.0011003393010469154, + "clip_ratio/low_mean": 0.0010205757225776324, + "clip_ratio/low_min": 7.395359716610983e-05, + "clip_ratio/region_mean": 0.002120915065461304, + "epoch": 0.7139107611548556, + "grad_norm": 0.2101132869720459, + "learning_rate": 1e-06, + "loss": -0.0254, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0028270179318496957, + "clip_ratio/high_mean": 0.0011196137056685984, + "clip_ratio/low_mean": 0.0012128089510952123, + "clip_ratio/low_min": 2.466942896717228e-05, + "clip_ratio/region_mean": 0.0023324227513512596, + "epoch": 0.7162438028579761, + "grad_norm": 0.18428486585617065, + "learning_rate": 1e-06, + "loss": -0.0256, + "step": 307 + }, + { + "clip_ratio/high_max": 0.002363015402806923, + "clip_ratio/high_mean": 0.0010658901901479112, + "clip_ratio/low_mean": 0.0014643462345702574, + "clip_ratio/low_min": 0.00010415140059194528, + "clip_ratio/region_mean": 0.002530236371967476, + "epoch": 0.7185768445610965, + "grad_norm": 0.17178674042224884, + "learning_rate": 1e-06, + "loss": -0.0257, + "step": 308 + }, + { + "clip_ratio/high_max": 0.002385223771852907, + "clip_ratio/high_mean": 0.0009904428261506837, + "clip_ratio/low_mean": 0.0005621537156912382, + "clip_ratio/low_min": 2.756753019639291e-05, + "clip_ratio/region_mean": 0.0015525966045970563, + "completions/clipped_ratio": 0.0848214285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2489.0, + "completions/mean_length": 897.09716796875, + "completions/mean_terminated_length": 600.6134033203125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.7209098862642169, + "grad_norm": 0.19375817477703094, + "learning_rate": 1e-06, + "loss": -0.0374, + "num_tokens": 47644244.0, + "reward": 0.6149553656578064, + "reward_std": 0.1734083741903305, + "rewards/verify_math_reward/mean": 0.6149553656578064, + "rewards/verify_math_reward/std": 0.4868776500225067, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0027656317324726842, + "clip_ratio/high_mean": 0.0011467549702501856, + "clip_ratio/low_mean": 0.0006837193341198144, + "clip_ratio/low_min": 1.0453252798470203e-05, + "clip_ratio/region_mean": 0.0018304742843611166, + "epoch": 0.7232429279673375, + "grad_norm": 0.1754893809556961, + "learning_rate": 1e-06, + "loss": -0.0375, + "step": 310 + }, + { + "clip_ratio/high_max": 0.002889984578359872, + "clip_ratio/high_mean": 0.0012613694634637795, + "clip_ratio/low_mean": 0.0009629708802094683, + "clip_ratio/low_min": 3.538256623869529e-05, + "clip_ratio/region_mean": 0.002224340358225163, + "epoch": 0.7255759696704579, + "grad_norm": 0.16185636818408966, + "learning_rate": 1e-06, + "loss": -0.0377, + "step": 311 + }, + { + "clip_ratio/high_max": 0.002871236007194966, + "clip_ratio/high_mean": 0.0012253369859536178, + "clip_ratio/low_mean": 0.001014525489154039, + "clip_ratio/low_min": 1.799078927433584e-05, + "clip_ratio/region_mean": 0.0022398624787456356, + "epoch": 0.7279090113735783, + "grad_norm": 0.17588123679161072, + "learning_rate": 1e-06, + "loss": -0.0378, + "step": 312 + }, + { + "clip_ratio/high_max": 0.002086147724185139, + "clip_ratio/high_mean": 0.0008469103850075044, + "clip_ratio/low_mean": 0.0008083435477601597, + "clip_ratio/low_min": 1.1007397006324027e-05, + "clip_ratio/region_mean": 0.0016552538945688866, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3981.0, + "completions/mean_length": 885.739990234375, + "completions/mean_terminated_length": 613.6840209960938, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 0.7302420530766988, + "grad_norm": 0.23492024838924408, + "learning_rate": 1e-06, + "loss": -0.0367, + "num_tokens": 48240235.0, + "reward": 0.5870535969734192, + "reward_std": 0.20587676763534546, + "rewards/verify_math_reward/mean": 0.5870535969734192, + "rewards/verify_math_reward/std": 0.49263837933540344, + "step": 313 + }, + { + "clip_ratio/high_max": 0.002379067002038937, + "clip_ratio/high_mean": 0.0010538723690842744, + "clip_ratio/low_mean": 0.0010255752076773206, + "clip_ratio/low_min": 5.036196489527356e-05, + "clip_ratio/region_mean": 0.0020794475931324996, + "epoch": 0.7325750947798192, + "grad_norm": 0.20596159994602203, + "learning_rate": 1e-06, + "loss": -0.0369, + "step": 314 + }, + { + "clip_ratio/high_max": 0.002566054739872925, + "clip_ratio/high_mean": 0.0010753861861303449, + "clip_ratio/low_mean": 0.0012584659052663483, + "clip_ratio/low_min": 1.940692527568899e-05, + "clip_ratio/region_mean": 0.0023338521641562693, + "epoch": 0.7349081364829396, + "grad_norm": 0.18503864109516144, + "learning_rate": 1e-06, + "loss": -0.0371, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0028267628003959544, + "clip_ratio/high_mean": 0.0011631867128016893, + "clip_ratio/low_mean": 0.001585173016792396, + "clip_ratio/low_min": 8.646383503219113e-05, + "clip_ratio/region_mean": 0.0027483597295940854, + "epoch": 0.73724117818606, + "grad_norm": 0.17676009237766266, + "learning_rate": 1e-06, + "loss": -0.0373, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0020308344828663394, + "clip_ratio/high_mean": 0.0008841200924507575, + "clip_ratio/low_mean": 0.0006222654037628672, + "clip_ratio/low_min": 3.280446890130406e-05, + "clip_ratio/region_mean": 0.001506385437096469, + "completions/clipped_ratio": 0.0725446428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2776.0, + "completions/mean_length": 837.7857666015625, + "completions/mean_terminated_length": 582.931396484375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.7395742198891805, + "grad_norm": 0.2314973920583725, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 48814699.0, + "reward": 0.613839328289032, + "reward_std": 0.19084171950817108, + "rewards/verify_math_reward/mean": 0.6138392686843872, + "rewards/verify_math_reward/std": 0.48714008927345276, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0025161904195556417, + "clip_ratio/high_mean": 0.0010388361515651923, + "clip_ratio/low_mean": 0.0009122929877776187, + "clip_ratio/low_min": 7.236033252411289e-05, + "clip_ratio/region_mean": 0.0019511291611706838, + "epoch": 0.7419072615923009, + "grad_norm": 0.20519977807998657, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0024081753472273704, + "clip_ratio/high_mean": 0.0009931490312737878, + "clip_ratio/low_mean": 0.001269341999432072, + "clip_ratio/low_min": 8.174411141226301e-05, + "clip_ratio/region_mean": 0.0022624911143793724, + "epoch": 0.7442403032954215, + "grad_norm": 0.1907121241092682, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0027625623624771833, + "clip_ratio/high_mean": 0.001064769570803037, + "clip_ratio/low_mean": 0.0013623945651488611, + "clip_ratio/low_min": 7.313470996450633e-05, + "clip_ratio/region_mean": 0.002427164145046845, + "epoch": 0.7465733449985419, + "grad_norm": 0.18573161959648132, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0020261932440917008, + "clip_ratio/high_mean": 0.0008740671855775872, + "clip_ratio/low_mean": 0.0007327803323278204, + "clip_ratio/low_min": 2.4659695554873906e-05, + "clip_ratio/region_mean": 0.001606847537914291, + "completions/clipped_ratio": 0.0959821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2926.0, + "completions/mean_length": 984.7779541015625, + "completions/mean_terminated_length": 654.4506225585938, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.7489063867016623, + "grad_norm": 0.22532038390636444, + "learning_rate": 1e-06, + "loss": -0.02, + "num_tokens": 49440620.0, + "reward": 0.5613839626312256, + "reward_std": 0.18416057527065277, + "rewards/verify_math_reward/mean": 0.5613839030265808, + "rewards/verify_math_reward/std": 0.496494859457016, + "step": 321 + }, + { + "clip_ratio/high_max": 0.002825237992510665, + "clip_ratio/high_mean": 0.0011012507529812865, + "clip_ratio/low_mean": 0.0009591566486051306, + "clip_ratio/low_min": 1.2329847777436953e-05, + "clip_ratio/region_mean": 0.002060407350654714, + "epoch": 0.7512394284047827, + "grad_norm": 0.2141539305448532, + "learning_rate": 1e-06, + "loss": -0.0202, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0029633525336976163, + "clip_ratio/high_mean": 0.0011840704191854456, + "clip_ratio/low_mean": 0.001118860669521382, + "clip_ratio/low_min": 4.405286381370388e-05, + "clip_ratio/region_mean": 0.002302931126905605, + "epoch": 0.7535724701079032, + "grad_norm": 0.17943425476551056, + "learning_rate": 1e-06, + "loss": -0.0204, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0028357562187011354, + "clip_ratio/high_mean": 0.001128319876443129, + "clip_ratio/low_mean": 0.001372187805827707, + "clip_ratio/low_min": 7.397909212158993e-05, + "clip_ratio/region_mean": 0.0025005076968227513, + "epoch": 0.7559055118110236, + "grad_norm": 0.16282466053962708, + "learning_rate": 1e-06, + "loss": -0.0205, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0017659160221228376, + "clip_ratio/high_mean": 0.0007654340861336095, + "clip_ratio/low_mean": 0.0006649163590282114, + "clip_ratio/low_min": 1.1312216884107329e-05, + "clip_ratio/region_mean": 0.001430350461305352, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3992.0, + "completions/mean_length": 940.239990234375, + "completions/mean_terminated_length": 643.5446166992188, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.758238553514144, + "grad_norm": 0.21520856022834778, + "learning_rate": 1e-06, + "loss": -0.035, + "num_tokens": 50061851.0, + "reward": 0.5424107313156128, + "reward_std": 0.18437741696834564, + "rewards/verify_math_reward/mean": 0.5424107313156128, + "rewards/verify_math_reward/std": 0.4984763562679291, + "step": 325 + }, + { + "clip_ratio/high_max": 0.002330236129637342, + "clip_ratio/high_mean": 0.0009415907279617386, + "clip_ratio/low_mean": 0.000835203873066348, + "clip_ratio/low_min": 3.393665247131139e-05, + "clip_ratio/region_mean": 0.001776794575562235, + "epoch": 0.7605715952172645, + "grad_norm": 0.1842113435268402, + "learning_rate": 1e-06, + "loss": -0.0353, + "step": 326 + }, + { + "clip_ratio/high_max": 0.00230511437257519, + "clip_ratio/high_mean": 0.001003508476060233, + "clip_ratio/low_mean": 0.0010929000836767955, + "clip_ratio/low_min": 1.1312216884107329e-05, + "clip_ratio/region_mean": 0.0020964085560990497, + "epoch": 0.7629046369203849, + "grad_norm": 0.16316959261894226, + "learning_rate": 1e-06, + "loss": -0.0354, + "step": 327 + }, + { + "clip_ratio/high_max": 0.002567019429989159, + "clip_ratio/high_mean": 0.000990610795270186, + "clip_ratio/low_mean": 0.0012704540695267497, + "clip_ratio/low_min": 9.153485734714195e-06, + "clip_ratio/region_mean": 0.0022610649466514587, + "epoch": 0.7652376786235054, + "grad_norm": 0.204722598195076, + "learning_rate": 1e-06, + "loss": -0.0355, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0017108206229750067, + "clip_ratio/high_mean": 0.0006872556414236897, + "clip_ratio/low_mean": 0.0005310801495852502, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012183357739559142, + "completions/clipped_ratio": 0.0691964285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2780.0, + "completions/mean_length": 852.4699096679688, + "completions/mean_terminated_length": 611.3441162109375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.7675707203266259, + "grad_norm": 0.21647314727306366, + "learning_rate": 1e-06, + "loss": -0.026, + "num_tokens": 50656928.0, + "reward": 0.6395089626312256, + "reward_std": 0.15737581253051758, + "rewards/verify_math_reward/mean": 0.6395089030265808, + "rewards/verify_math_reward/std": 0.4804111421108246, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0020555987684929278, + "clip_ratio/high_mean": 0.0007949248674776754, + "clip_ratio/low_mean": 0.000770878323010038, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015658032170904335, + "epoch": 0.7699037620297463, + "grad_norm": 0.1775524616241455, + "learning_rate": 1e-06, + "loss": -0.0262, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0025171050292556174, + "clip_ratio/high_mean": 0.0009998566874855896, + "clip_ratio/low_mean": 0.0008616442519269185, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018615009503264446, + "epoch": 0.7722368037328667, + "grad_norm": 0.16343437135219574, + "learning_rate": 1e-06, + "loss": -0.0264, + "step": 331 + }, + { + "clip_ratio/high_max": 0.002173786866478622, + "clip_ratio/high_mean": 0.0009114055246755015, + "clip_ratio/low_mean": 0.0010367860127189488, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019481915842334274, + "epoch": 0.7745698454359872, + "grad_norm": 0.15880723297595978, + "learning_rate": 1e-06, + "loss": -0.0264, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0017265135975321755, + "clip_ratio/high_mean": 0.0005917233247600961, + "clip_ratio/low_mean": 0.0006106325026848936, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001202355808345601, + "completions/clipped_ratio": 0.1227678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3840.0, + "completions/mean_length": 1066.146240234375, + "completions/mean_terminated_length": 642.120849609375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.7769028871391076, + "grad_norm": 0.21014513075351715, + "learning_rate": 1e-06, + "loss": -0.0184, + "num_tokens": 51257251.0, + "reward": 0.504464328289032, + "reward_std": 0.1623242348432541, + "rewards/verify_math_reward/mean": 0.5044642686843872, + "rewards/verify_math_reward/std": 0.5002593398094177, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0021170642612560187, + "clip_ratio/high_mean": 0.0007432475922541926, + "clip_ratio/low_mean": 0.0008848217221384402, + "clip_ratio/low_min": 1.387963584420504e-05, + "clip_ratio/region_mean": 0.0016280692943837494, + "epoch": 0.779235928842228, + "grad_norm": 0.18322604894638062, + "learning_rate": 1e-06, + "loss": -0.0186, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0023809336016711313, + "clip_ratio/high_mean": 0.0008279726753244177, + "clip_ratio/low_mean": 0.0009638046303734882, + "clip_ratio/low_min": 1.4501159967039712e-05, + "clip_ratio/region_mean": 0.00179177729296498, + "epoch": 0.7815689705453485, + "grad_norm": 0.1649850606918335, + "learning_rate": 1e-06, + "loss": -0.0188, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0023388637600874063, + "clip_ratio/high_mean": 0.0008620175885880599, + "clip_ratio/low_mean": 0.0011599286262935493, + "clip_ratio/low_min": 5.6761591622489505e-05, + "clip_ratio/region_mean": 0.0020219462167005986, + "epoch": 0.7839020122484689, + "grad_norm": 0.1583496779203415, + "learning_rate": 1e-06, + "loss": -0.0188, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0019701817764143925, + "clip_ratio/high_mean": 0.0008163482880263473, + "clip_ratio/low_mean": 0.0004907682782686607, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013071165485598613, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3998.0, + "completions/mean_length": 972.9855346679688, + "completions/mean_terminated_length": 619.9490356445312, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.7862350539515894, + "grad_norm": 0.2171715795993805, + "learning_rate": 1e-06, + "loss": -0.0345, + "num_tokens": 51846038.0, + "reward": 0.5714285969734192, + "reward_std": 0.15593409538269043, + "rewards/verify_math_reward/mean": 0.5714285969734192, + "rewards/verify_math_reward/std": 0.49514803290367126, + "step": 337 + }, + { + "clip_ratio/high_max": 0.002572505996795371, + "clip_ratio/high_mean": 0.0009753500671649817, + "clip_ratio/low_mean": 0.000732240172510501, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017075901851058006, + "epoch": 0.7885680956547099, + "grad_norm": 0.20398429036140442, + "learning_rate": 1e-06, + "loss": -0.0347, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0028881516482215375, + "clip_ratio/high_mean": 0.001134720796471811, + "clip_ratio/low_mean": 0.0007902334382379195, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019249542820034549, + "epoch": 0.7909011373578303, + "grad_norm": 0.16836610436439514, + "learning_rate": 1e-06, + "loss": -0.0349, + "step": 339 + }, + { + "clip_ratio/high_max": 0.002433148052659817, + "clip_ratio/high_mean": 0.0010081968639497063, + "clip_ratio/low_mean": 0.001013724282529438, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002021921107370872, + "epoch": 0.7932341790609507, + "grad_norm": 0.15733319520950317, + "learning_rate": 1e-06, + "loss": -0.035, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0019665661711769644, + "clip_ratio/high_mean": 0.0006874319560665754, + "clip_ratio/low_mean": 0.0005987316217215266, + "clip_ratio/low_min": 5.924893321207492e-05, + "clip_ratio/region_mean": 0.001286163580516586, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3553.0, + "completions/mean_length": 863.029052734375, + "completions/mean_terminated_length": 618.5186157226562, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.7955672207640712, + "grad_norm": 0.22709819674491882, + "learning_rate": 1e-06, + "loss": -0.0126, + "num_tokens": 52448128.0, + "reward": 0.5625, + "reward_std": 0.15225742757320404, + "rewards/verify_math_reward/mean": 0.5625, + "rewards/verify_math_reward/std": 0.49635544419288635, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0019139994692523032, + "clip_ratio/high_mean": 0.0007924420715426095, + "clip_ratio/low_mean": 0.0008447009604424238, + "clip_ratio/low_min": 0.00012417925518093398, + "clip_ratio/region_mean": 0.0016371430319850333, + "epoch": 0.7979002624671916, + "grad_norm": 0.17402707040309906, + "learning_rate": 1e-06, + "loss": -0.0129, + "step": 342 + }, + { + "clip_ratio/high_max": 0.00228307512588799, + "clip_ratio/high_mean": 0.0007988824791027582, + "clip_ratio/low_mean": 0.0009502735165369813, + "clip_ratio/low_min": 0.00010351143191655865, + "clip_ratio/region_mean": 0.0017491560320195276, + "epoch": 0.800233304170312, + "grad_norm": 0.1693955808877945, + "learning_rate": 1e-06, + "loss": -0.013, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0020274076487112325, + "clip_ratio/high_mean": 0.0008058251578404452, + "clip_ratio/low_mean": 0.0012059471646352904, + "clip_ratio/low_min": 0.0001847325765993446, + "clip_ratio/region_mean": 0.002011772339756135, + "epoch": 0.8025663458734325, + "grad_norm": 0.15824578702449799, + "learning_rate": 1e-06, + "loss": -0.0131, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0024287665510200895, + "clip_ratio/high_mean": 0.0010701709761633538, + "clip_ratio/low_mean": 0.0005244867297733435, + "clip_ratio/low_min": 4.60348637716379e-05, + "clip_ratio/region_mean": 0.0015946576968417503, + "completions/clipped_ratio": 0.0658482142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2066.0, + "completions/mean_length": 801.6574096679688, + "completions/mean_terminated_length": 569.4396362304688, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.8048993875765529, + "grad_norm": 0.25556159019470215, + "learning_rate": 1e-06, + "loss": -0.0212, + "num_tokens": 53005413.0, + "reward": 0.6383928656578064, + "reward_std": 0.18126347661018372, + "rewards/verify_math_reward/mean": 0.6383928656578064, + "rewards/verify_math_reward/std": 0.4807341694831848, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0026033492176793516, + "clip_ratio/high_mean": 0.0011321816746203694, + "clip_ratio/low_mean": 0.0007589621236547828, + "clip_ratio/low_min": 1.8463810192770325e-05, + "clip_ratio/region_mean": 0.0018911437728093006, + "epoch": 0.8072324292796734, + "grad_norm": 0.2019054889678955, + "learning_rate": 1e-06, + "loss": -0.0214, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0024528612120775506, + "clip_ratio/high_mean": 0.0010744591600087006, + "clip_ratio/low_mean": 0.0009249952199752443, + "clip_ratio/low_min": 5.539143239730038e-05, + "clip_ratio/region_mean": 0.001999454398173839, + "epoch": 0.8095654709827939, + "grad_norm": 0.18154841661453247, + "learning_rate": 1e-06, + "loss": -0.0215, + "step": 347 + }, + { + "clip_ratio/high_max": 0.002686970961804036, + "clip_ratio/high_mean": 0.0011972458305535838, + "clip_ratio/low_mean": 0.0011541733802005183, + "clip_ratio/low_min": 0.00010741467849584296, + "clip_ratio/region_mean": 0.002351419192564208, + "epoch": 0.8118985126859143, + "grad_norm": 0.179201140999794, + "learning_rate": 1e-06, + "loss": -0.0217, + "step": 348 + }, + { + "clip_ratio/high_max": 0.002198454505560221, + "clip_ratio/high_mean": 0.0009481786855758401, + "clip_ratio/low_mean": 0.0006049338808225002, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015531125500274356, + "completions/clipped_ratio": 0.0825892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3831.0, + "completions/mean_length": 877.8605346679688, + "completions/mean_terminated_length": 588.149658203125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.8142315543890347, + "grad_norm": 0.22538886964321136, + "learning_rate": 1e-06, + "loss": -0.0396, + "num_tokens": 53568944.0, + "reward": 0.6573660969734192, + "reward_std": 0.1759275496006012, + "rewards/verify_math_reward/mean": 0.6573660969734192, + "rewards/verify_math_reward/std": 0.47485533356666565, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0029960423926240765, + "clip_ratio/high_mean": 0.001205331936944276, + "clip_ratio/low_mean": 0.0009345814778498607, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021399134275270626, + "epoch": 0.8165645960921551, + "grad_norm": 0.25057047605514526, + "learning_rate": 1e-06, + "loss": -0.0398, + "step": 350 + }, + { + "clip_ratio/high_max": 0.002933300071163103, + "clip_ratio/high_mean": 0.0012073634934495203, + "clip_ratio/low_mean": 0.00103660965396557, + "clip_ratio/low_min": 2.9634898965014145e-05, + "clip_ratio/region_mean": 0.0022439731619670056, + "epoch": 0.8188976377952756, + "grad_norm": 0.18622159957885742, + "learning_rate": 1e-06, + "loss": -0.04, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0032266734924633056, + "clip_ratio/high_mean": 0.0013009330614295322, + "clip_ratio/low_mean": 0.0013344781218620483, + "clip_ratio/low_min": 2.133105772372801e-05, + "clip_ratio/region_mean": 0.002635411190567538, + "epoch": 0.821230679498396, + "grad_norm": 0.17074429988861084, + "learning_rate": 1e-06, + "loss": -0.0402, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0017971512352232821, + "clip_ratio/high_mean": 0.0007026633556961315, + "clip_ratio/low_mean": 0.0006032489382050699, + "clip_ratio/low_min": 1.3160665730538312e-05, + "clip_ratio/region_mean": 0.0013059123120910954, + "completions/clipped_ratio": 0.0892857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3867.0, + "completions/mean_length": 921.6998291015625, + "completions/mean_terminated_length": 610.493896484375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.8235637212015164, + "grad_norm": 0.2515932023525238, + "learning_rate": 1e-06, + "loss": -0.0245, + "num_tokens": 54168955.0, + "reward": 0.5424107313156128, + "reward_std": 0.15988317131996155, + "rewards/verify_math_reward/mean": 0.5424107313156128, + "rewards/verify_math_reward/std": 0.4984763562679291, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0021717835843446665, + "clip_ratio/high_mean": 0.0007233772030303953, + "clip_ratio/low_mean": 0.0008419789210165618, + "clip_ratio/low_min": 1.4545031262969133e-05, + "clip_ratio/region_mean": 0.0015653561240469571, + "epoch": 0.8258967629046369, + "grad_norm": 0.1717970371246338, + "learning_rate": 1e-06, + "loss": -0.0247, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0023595982711412944, + "clip_ratio/high_mean": 0.000842728490169975, + "clip_ratio/low_mean": 0.0009849208618106786, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018276493137818761, + "epoch": 0.8282298046077574, + "grad_norm": 0.1771852672100067, + "learning_rate": 1e-06, + "loss": -0.0248, + "step": 355 + }, + { + "clip_ratio/high_max": 0.00218829129516962, + "clip_ratio/high_mean": 0.0008186861086869612, + "clip_ratio/low_mean": 0.0012674419085669797, + "clip_ratio/low_min": 1.3160665730538312e-05, + "clip_ratio/region_mean": 0.0020861279699602164, + "epoch": 0.8305628463108778, + "grad_norm": 0.15224948525428772, + "learning_rate": 1e-06, + "loss": -0.0249, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0019288021903776098, + "clip_ratio/high_mean": 0.0007495696963815135, + "clip_ratio/low_mean": 0.00046653347021674563, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012161031409050338, + "completions/clipped_ratio": 0.0870535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3902.0, + "completions/mean_length": 918.7522583007812, + "completions/mean_terminated_length": 615.7872924804688, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.8328958880139983, + "grad_norm": 0.21693141758441925, + "learning_rate": 1e-06, + "loss": -0.0144, + "num_tokens": 54760221.0, + "reward": 0.5491071939468384, + "reward_std": 0.1401536762714386, + "rewards/verify_math_reward/mean": 0.5491071343421936, + "rewards/verify_math_reward/std": 0.49786055088043213, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0026613782247295603, + "clip_ratio/high_mean": 0.0009070894393516937, + "clip_ratio/low_mean": 0.0006486262627731776, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015557157057628501, + "epoch": 0.8352289297171187, + "grad_norm": 0.179405078291893, + "learning_rate": 1e-06, + "loss": -0.0146, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0024907990446081385, + "clip_ratio/high_mean": 0.0009263725514756516, + "clip_ratio/low_mean": 0.0008601827576057985, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017865552872535773, + "epoch": 0.8375619714202391, + "grad_norm": 0.15576620399951935, + "learning_rate": 1e-06, + "loss": -0.0148, + "step": 359 + }, + { + "clip_ratio/high_max": 0.002331049137865193, + "clip_ratio/high_mean": 0.0008176261253538541, + "clip_ratio/low_mean": 0.0009222162534570089, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017398423842678312, + "epoch": 0.8398950131233596, + "grad_norm": 0.18607597053050995, + "learning_rate": 1e-06, + "loss": -0.0148, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0020165567293588538, + "clip_ratio/high_mean": 0.000853951276440057, + "clip_ratio/low_mean": 0.0006930507770448457, + "clip_ratio/low_min": 2.6749410608317703e-05, + "clip_ratio/region_mean": 0.0015470020662178285, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3452.0, + "completions/mean_length": 865.521240234375, + "completions/mean_terminated_length": 591.7518310546875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.84222805482648, + "grad_norm": 0.24153633415699005, + "learning_rate": 1e-06, + "loss": -0.0373, + "num_tokens": 55336968.0, + "reward": 0.5915178656578064, + "reward_std": 0.1732928454875946, + "rewards/verify_math_reward/mean": 0.5915178656578064, + "rewards/verify_math_reward/std": 0.49182769656181335, + "step": 361 + }, + { + "clip_ratio/high_max": 0.002464430181134958, + "clip_ratio/high_mean": 0.0010238850773021113, + "clip_ratio/low_mean": 0.0009376328980579274, + "clip_ratio/low_min": 2.2498199541587383e-05, + "clip_ratio/region_mean": 0.0019615179335232824, + "epoch": 0.8445610965296004, + "grad_norm": 0.20365145802497864, + "learning_rate": 1e-06, + "loss": -0.0376, + "step": 362 + }, + { + "clip_ratio/high_max": 0.002556008934334386, + "clip_ratio/high_mean": 0.0010945256326522212, + "clip_ratio/low_mean": 0.001119626198487822, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002214151834778022, + "epoch": 0.8468941382327209, + "grad_norm": 0.18342220783233643, + "learning_rate": 1e-06, + "loss": -0.0377, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0025188724284817, + "clip_ratio/high_mean": 0.0009661511594458716, + "clip_ratio/low_mean": 0.0013099270981911104, + "clip_ratio/low_min": 2.2123893359093927e-05, + "clip_ratio/region_mean": 0.002276078288559802, + "epoch": 0.8492271799358414, + "grad_norm": 0.183487206697464, + "learning_rate": 1e-06, + "loss": -0.0378, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0023666196648264304, + "clip_ratio/high_mean": 0.000941422698815586, + "clip_ratio/low_mean": 0.0005456536673591472, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014870763588987757, + "completions/clipped_ratio": 0.0725446428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3743.0, + "completions/mean_length": 881.25341796875, + "completions/mean_terminated_length": 629.799072265625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 0.8515602216389618, + "grad_norm": 0.21291105449199677, + "learning_rate": 1e-06, + "loss": -0.0398, + "num_tokens": 55947411.0, + "reward": 0.6417410969734192, + "reward_std": 0.16958087682724, + "rewards/verify_math_reward/mean": 0.6417410969734192, + "rewards/verify_math_reward/std": 0.47975653409957886, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0023889943404356018, + "clip_ratio/high_mean": 0.0010003773859352805, + "clip_ratio/low_mean": 0.0007598972979394603, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001760274768457748, + "epoch": 0.8538932633420823, + "grad_norm": 0.18001306056976318, + "learning_rate": 1e-06, + "loss": -0.0401, + "step": 366 + }, + { + "clip_ratio/high_max": 0.002810595411574468, + "clip_ratio/high_mean": 0.0011152851329825353, + "clip_ratio/low_mean": 0.0008203240631701192, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019356091434019618, + "epoch": 0.8562263050452027, + "grad_norm": 0.16618499159812927, + "learning_rate": 1e-06, + "loss": -0.0402, + "step": 367 + }, + { + "clip_ratio/high_max": 0.002615925644931849, + "clip_ratio/high_mean": 0.0010511315558687784, + "clip_ratio/low_mean": 0.0010743256916612154, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021254572566249408, + "epoch": 0.8585593467483231, + "grad_norm": 0.1623554676771164, + "learning_rate": 1e-06, + "loss": -0.0403, + "step": 368 + }, + { + "clip_ratio/high_max": 0.002231936941825552, + "clip_ratio/high_mean": 0.0008358096129086334, + "clip_ratio/low_mean": 0.0006096504575907602, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014454600895987824, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3463.0, + "completions/mean_length": 927.8984985351562, + "completions/mean_terminated_length": 600.1637573242188, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.8608923884514436, + "grad_norm": 0.2518787682056427, + "learning_rate": 1e-06, + "loss": -0.0199, + "num_tokens": 56532280.0, + "reward": 0.5680803656578064, + "reward_std": 0.18975545465946198, + "rewards/verify_math_reward/mean": 0.5680803656578064, + "rewards/verify_math_reward/std": 0.4956200420856476, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0022443662091973238, + "clip_ratio/high_mean": 0.0009726381431391928, + "clip_ratio/low_mean": 0.0009172410991595825, + "clip_ratio/low_min": 2.8811271477025002e-05, + "clip_ratio/region_mean": 0.0018898792513937224, + "epoch": 0.863225430154564, + "grad_norm": 0.21699672937393188, + "learning_rate": 1e-06, + "loss": -0.0203, + "step": 370 + }, + { + "clip_ratio/high_max": 0.002635184835526161, + "clip_ratio/high_mean": 0.0010936449471046217, + "clip_ratio/low_mean": 0.0010467112697369885, + "clip_ratio/low_min": 3.5840941563947126e-05, + "clip_ratio/region_mean": 0.00214035621684161, + "epoch": 0.8655584718576844, + "grad_norm": 0.18343502283096313, + "learning_rate": 1e-06, + "loss": -0.0204, + "step": 371 + }, + { + "clip_ratio/high_max": 0.002463686316332314, + "clip_ratio/high_mean": 0.000991379471088294, + "clip_ratio/low_mean": 0.0012978556478628889, + "clip_ratio/low_min": 8.788790728431195e-05, + "clip_ratio/region_mean": 0.0022892351262271404, + "epoch": 0.8678915135608049, + "grad_norm": 0.20154787600040436, + "learning_rate": 1e-06, + "loss": -0.0205, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0015627559732820373, + "clip_ratio/high_mean": 0.0007060485222609714, + "clip_ratio/low_mean": 0.0005485578758452903, + "clip_ratio/low_min": 9.521632819087245e-06, + "clip_ratio/region_mean": 0.0012546063589979894, + "completions/clipped_ratio": 0.0803571428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4008.0, + "completions/mean_length": 925.9285888671875, + "completions/mean_terminated_length": 648.9320678710938, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.8702245552639254, + "grad_norm": 0.2302597016096115, + "learning_rate": 1e-06, + "loss": -0.0179, + "num_tokens": 57143752.0, + "reward": 0.5613839626312256, + "reward_std": 0.17092610895633698, + "rewards/verify_math_reward/mean": 0.5613839030265808, + "rewards/verify_math_reward/std": 0.496494859457016, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0020515899450401776, + "clip_ratio/high_mean": 0.0007992923783604056, + "clip_ratio/low_mean": 0.0007848022023608792, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015840945925447159, + "epoch": 0.8725575969670458, + "grad_norm": 0.17715898156166077, + "learning_rate": 1e-06, + "loss": -0.0181, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0024287963824463077, + "clip_ratio/high_mean": 0.0008912174562283326, + "clip_ratio/low_mean": 0.0009563523071847158, + "clip_ratio/low_min": 2.8564900276251137e-05, + "clip_ratio/region_mean": 0.0018475697725079954, + "epoch": 0.8748906386701663, + "grad_norm": 0.245353102684021, + "learning_rate": 1e-06, + "loss": -0.0182, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0021456541326188017, + "clip_ratio/high_mean": 0.0008803438231552718, + "clip_ratio/low_mean": 0.001216527227370534, + "clip_ratio/low_min": 2.196836612711195e-05, + "clip_ratio/region_mean": 0.002096871074172668, + "epoch": 0.8772236803732867, + "grad_norm": 0.15898433327674866, + "learning_rate": 1e-06, + "loss": -0.0184, + "step": 376 + }, + { + "clip_ratio/high_max": 0.002165385289117694, + "clip_ratio/high_mean": 0.0008119576868921285, + "clip_ratio/low_mean": 0.000668839120407938, + "clip_ratio/low_min": 2.529340417822823e-05, + "clip_ratio/region_mean": 0.0014807967891101725, + "completions/clipped_ratio": 0.0915178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3139.0, + "completions/mean_length": 963.4910888671875, + "completions/mean_terminated_length": 647.9312133789062, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.8795567220764071, + "grad_norm": 0.21807657182216644, + "learning_rate": 1e-06, + "loss": -0.0364, + "num_tokens": 57758664.0, + "reward": 0.5412946939468384, + "reward_std": 0.19512638449668884, + "rewards/verify_math_reward/mean": 0.5412946343421936, + "rewards/verify_math_reward/std": 0.49857014417648315, + "step": 377 + }, + { + "clip_ratio/high_max": 0.002510105274268426, + "clip_ratio/high_mean": 0.001023588039970491, + "clip_ratio/low_mean": 0.0008575574447604595, + "clip_ratio/low_min": 3.056949663005071e-05, + "clip_ratio/region_mean": 0.0018811454792739823, + "epoch": 0.8818897637795275, + "grad_norm": 0.24140609800815582, + "learning_rate": 1e-06, + "loss": -0.0367, + "step": 378 + }, + { + "clip_ratio/high_max": 0.00252076792094158, + "clip_ratio/high_mean": 0.0010582318172964733, + "clip_ratio/low_mean": 0.0010428122513985727, + "clip_ratio/low_min": 3.6654876566899475e-05, + "clip_ratio/region_mean": 0.0021010440614190884, + "epoch": 0.884222805482648, + "grad_norm": 0.1708415299654007, + "learning_rate": 1e-06, + "loss": -0.0368, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0025287936223321594, + "clip_ratio/high_mean": 0.0010501180331630167, + "clip_ratio/low_mean": 0.0013154496082279366, + "clip_ratio/low_min": 9.351067819807213e-05, + "clip_ratio/region_mean": 0.002365567663218826, + "epoch": 0.8865558471857684, + "grad_norm": 0.17794418334960938, + "learning_rate": 1e-06, + "loss": -0.0369, + "step": 380 + }, + { + "clip_ratio/high_max": 0.002487028541509062, + "clip_ratio/high_mean": 0.0010293005434505176, + "clip_ratio/low_mean": 0.000717295255526551, + "clip_ratio/low_min": 1.2608432371052913e-05, + "clip_ratio/region_mean": 0.0017465958153479733, + "completions/clipped_ratio": 0.0837053571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3377.0, + "completions/mean_length": 919.2344360351562, + "completions/mean_terminated_length": 629.0304565429688, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.8888888888888888, + "grad_norm": 0.24101272225379944, + "learning_rate": 1e-06, + "loss": -0.0444, + "num_tokens": 58358554.0, + "reward": 0.5625, + "reward_std": 0.2137964516878128, + "rewards/verify_math_reward/mean": 0.5625, + "rewards/verify_math_reward/std": 0.49635544419288635, + "step": 381 + }, + { + "clip_ratio/high_max": 0.002524464900488965, + "clip_ratio/high_mean": 0.0012198094445921015, + "clip_ratio/low_mean": 0.0009389289261889644, + "clip_ratio/low_min": 1.2608432371052913e-05, + "clip_ratio/region_mean": 0.002158738367143087, + "epoch": 0.8912219305920094, + "grad_norm": 0.20234255492687225, + "learning_rate": 1e-06, + "loss": -0.0447, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0027121251841890626, + "clip_ratio/high_mean": 0.0012514218615251593, + "clip_ratio/low_mean": 0.0012125361117796274, + "clip_ratio/low_min": 1.2014609637844842e-05, + "clip_ratio/region_mean": 0.0024639579642098397, + "epoch": 0.8935549722951298, + "grad_norm": 0.18023191392421722, + "learning_rate": 1e-06, + "loss": -0.0449, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0028525425732368603, + "clip_ratio/high_mean": 0.0012203649239381775, + "clip_ratio/low_mean": 0.0013325776926649269, + "clip_ratio/low_min": 4.805843855137937e-05, + "clip_ratio/region_mean": 0.0025529425765853375, + "epoch": 0.8958880139982502, + "grad_norm": 0.2880335748195648, + "learning_rate": 1e-06, + "loss": -0.0449, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0023073976262821816, + "clip_ratio/high_mean": 0.0009232878073817119, + "clip_ratio/low_mean": 0.000623939338765922, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015472271697944961, + "completions/clipped_ratio": 0.1082589285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3576.0, + "completions/mean_length": 988.0011596679688, + "completions/mean_terminated_length": 610.6846313476562, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 0.8982210557013707, + "grad_norm": 0.22138522565364838, + "learning_rate": 1e-06, + "loss": -0.0393, + "num_tokens": 58937787.0, + "reward": 0.546875, + "reward_std": 0.17461372911930084, + "rewards/verify_math_reward/mean": 0.546875, + "rewards/verify_math_reward/std": 0.4980759024620056, + "step": 385 + }, + { + "clip_ratio/high_max": 0.002377451295615174, + "clip_ratio/high_mean": 0.000971550065514748, + "clip_ratio/low_mean": 0.0008350397292815614, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001806589811167214, + "epoch": 0.9005540974044911, + "grad_norm": 0.21801231801509857, + "learning_rate": 1e-06, + "loss": -0.0396, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0024675863969605416, + "clip_ratio/high_mean": 0.001060649034116068, + "clip_ratio/low_mean": 0.0009746981941134436, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020353472282295115, + "epoch": 0.9028871391076115, + "grad_norm": 0.18678408861160278, + "learning_rate": 1e-06, + "loss": -0.0397, + "step": 387 + }, + { + "clip_ratio/high_max": 0.002575601582066156, + "clip_ratio/high_mean": 0.0010663571447366849, + "clip_ratio/low_mean": 0.0012509522930486128, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002317309415957425, + "epoch": 0.905220180810732, + "grad_norm": 0.19864848256111145, + "learning_rate": 1e-06, + "loss": -0.0399, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0014403713648789562, + "clip_ratio/high_mean": 0.0005397231088863919, + "clip_ratio/low_mean": 0.0004968672092218185, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010365903108322527, + "completions/clipped_ratio": 0.1383928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3012.0, + "completions/mean_length": 1146.665283203125, + "completions/mean_terminated_length": 672.9378051757812, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.9075532225138524, + "grad_norm": 0.2242993712425232, + "learning_rate": 1e-06, + "loss": -0.0327, + "num_tokens": 59543223.0, + "reward": 0.4933035969734192, + "reward_std": 0.14199379086494446, + "rewards/verify_math_reward/mean": 0.4933035671710968, + "rewards/verify_math_reward/std": 0.5002344250679016, + "step": 389 + }, + { + "clip_ratio/high_max": 0.001978857944777701, + "clip_ratio/high_mean": 0.0007474165613530204, + "clip_ratio/low_mean": 0.0006779378245482803, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014253543886297848, + "epoch": 0.9098862642169728, + "grad_norm": 0.1980229914188385, + "learning_rate": 1e-06, + "loss": -0.033, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0021036925827502273, + "clip_ratio/high_mean": 0.0007958486821735278, + "clip_ratio/low_mean": 0.0007948368183861021, + "clip_ratio/low_min": 3.2445059332530946e-05, + "clip_ratio/region_mean": 0.0015906855296634603, + "epoch": 0.9122193059200934, + "grad_norm": 0.18185946345329285, + "learning_rate": 1e-06, + "loss": -0.0331, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0019272925237601157, + "clip_ratio/high_mean": 0.0007416932021442335, + "clip_ratio/low_mean": 0.0009933531619026326, + "clip_ratio/low_min": 3.4594097087392583e-05, + "clip_ratio/region_mean": 0.0017350463895127177, + "epoch": 0.9145523476232138, + "grad_norm": 0.16489240527153015, + "learning_rate": 1e-06, + "loss": -0.0331, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0018829276377800852, + "clip_ratio/high_mean": 0.0008267601297120564, + "clip_ratio/low_mean": 0.0007134396828405443, + "clip_ratio/low_min": 3.410331191844307e-05, + "clip_ratio/region_mean": 0.0015401998243760318, + "completions/clipped_ratio": 0.1171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4007.0, + "completions/mean_length": 1032.266845703125, + "completions/mean_terminated_length": 625.5765380859375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.9168853893263342, + "grad_norm": 0.22167567908763885, + "learning_rate": 1e-06, + "loss": -0.0315, + "num_tokens": 60137270.0, + "reward": 0.5111607313156128, + "reward_std": 0.17851904034614563, + "rewards/verify_math_reward/mean": 0.5111607313156128, + "rewards/verify_math_reward/std": 0.5001546144485474, + "step": 393 + }, + { + "clip_ratio/high_max": 0.002248465272714384, + "clip_ratio/high_mean": 0.0009830765520746354, + "clip_ratio/low_mean": 0.000986932851446909, + "clip_ratio/low_min": 4.5183445763541386e-05, + "clip_ratio/region_mean": 0.0019700093725987244, + "epoch": 0.9192184310294547, + "grad_norm": 0.21001669764518738, + "learning_rate": 1e-06, + "loss": -0.0318, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0024827140441630036, + "clip_ratio/high_mean": 0.0010166514584852848, + "clip_ratio/low_mean": 0.00117380219126062, + "clip_ratio/low_min": 5.6479304475942627e-05, + "clip_ratio/region_mean": 0.0021904536624788307, + "epoch": 0.9215514727325751, + "grad_norm": 0.20626439154148102, + "learning_rate": 1e-06, + "loss": -0.0319, + "step": 395 + }, + { + "clip_ratio/high_max": 0.002260945359012112, + "clip_ratio/high_mean": 0.0009840688053373015, + "clip_ratio/low_mean": 0.0013305551474331878, + "clip_ratio/low_min": 5.683884955942631e-05, + "clip_ratio/region_mean": 0.002314623910933733, + "epoch": 0.9238845144356955, + "grad_norm": 0.18903161585330963, + "learning_rate": 1e-06, + "loss": -0.032, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0024143254122463986, + "clip_ratio/high_mean": 0.0008168222502717981, + "clip_ratio/low_mean": 0.0005513674987014383, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001368189728964353, + "completions/clipped_ratio": 0.1495535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3614.0, + "completions/mean_length": 1145.763427734375, + "completions/mean_terminated_length": 626.9553833007812, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.926217556138816, + "grad_norm": 0.22756241261959076, + "learning_rate": 1e-06, + "loss": -0.05, + "num_tokens": 60705274.0, + "reward": 0.5345982313156128, + "reward_std": 0.156542107462883, + "rewards/verify_math_reward/mean": 0.5345982313156128, + "rewards/verify_math_reward/std": 0.499080091714859, + "step": 397 + }, + { + "clip_ratio/high_max": 0.002396913645497989, + "clip_ratio/high_mean": 0.0009040411860041786, + "clip_ratio/low_mean": 0.0007567752163595287, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016608163641649298, + "epoch": 0.9285505978419364, + "grad_norm": 0.1984955072402954, + "learning_rate": 1e-06, + "loss": -0.0502, + "step": 398 + }, + { + "clip_ratio/high_max": 0.002589438794529997, + "clip_ratio/high_mean": 0.0009209879790432751, + "clip_ratio/low_mean": 0.0009547004647174617, + "clip_ratio/low_min": 1.715148209768813e-05, + "clip_ratio/region_mean": 0.0018756884637696203, + "epoch": 0.9308836395450568, + "grad_norm": 0.18810021877288818, + "learning_rate": 1e-06, + "loss": -0.0503, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0030086884726188146, + "clip_ratio/high_mean": 0.0010029266068158904, + "clip_ratio/low_mean": 0.001136030974521418, + "clip_ratio/low_min": 3.430296419537626e-05, + "clip_ratio/region_mean": 0.002138957632269012, + "epoch": 0.9332166812481774, + "grad_norm": 0.1831684708595276, + "learning_rate": 1e-06, + "loss": -0.0504, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0022944446245674044, + "clip_ratio/high_mean": 0.0009729608082125196, + "clip_ratio/low_mean": 0.0006262494916882133, + "clip_ratio/low_min": 2.6870164219872095e-05, + "clip_ratio/region_mean": 0.001599210318090627, + "completions/clipped_ratio": 0.0982142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3530.0, + "completions/mean_length": 968.2835083007812, + "completions/mean_terminated_length": 627.6410522460938, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.9355497229512978, + "grad_norm": 0.2524567246437073, + "learning_rate": 1e-06, + "loss": -0.0506, + "num_tokens": 61297928.0, + "reward": 0.5691964626312256, + "reward_std": 0.1988469660282135, + "rewards/verify_math_reward/mean": 0.5691964030265808, + "rewards/verify_math_reward/std": 0.4954652488231659, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0024275017785839736, + "clip_ratio/high_mean": 0.0011202143268747022, + "clip_ratio/low_mean": 0.0008954741042543901, + "clip_ratio/low_min": 2.6870164219872095e-05, + "clip_ratio/region_mean": 0.0020156884420430288, + "epoch": 0.9378827646544182, + "grad_norm": 0.19105035066604614, + "learning_rate": 1e-06, + "loss": -0.0508, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0025642600739956833, + "clip_ratio/high_mean": 0.0011850045411847532, + "clip_ratio/low_mean": 0.0010663451394066215, + "clip_ratio/low_min": 8.061048720264807e-05, + "clip_ratio/region_mean": 0.0022513496660394594, + "epoch": 0.9402158063575387, + "grad_norm": 0.1914723962545395, + "learning_rate": 1e-06, + "loss": -0.0509, + "step": 403 + }, + { + "clip_ratio/high_max": 0.002771794876025524, + "clip_ratio/high_mean": 0.0011238851147936657, + "clip_ratio/low_mean": 0.0012456118020054419, + "clip_ratio/low_min": 5.5915901612024754e-05, + "clip_ratio/region_mean": 0.0023694968404015526, + "epoch": 0.9425488480606591, + "grad_norm": 0.1938973218202591, + "learning_rate": 1e-06, + "loss": -0.0511, + "step": 404 + }, + { + "clip_ratio/high_max": 0.002230028661870165, + "clip_ratio/high_mean": 0.0007739496850263095, + "clip_ratio/low_mean": 0.0006409202414943138, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00141486987558892, + "completions/clipped_ratio": 0.0982142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3528.0, + "completions/mean_length": 962.7857666015625, + "completions/mean_terminated_length": 621.5445556640625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.9448818897637795, + "grad_norm": 0.22243300080299377, + "learning_rate": 1e-06, + "loss": -0.0346, + "num_tokens": 61900048.0, + "reward": 0.5167410969734192, + "reward_std": 0.155902698636055, + "rewards/verify_math_reward/mean": 0.5167410969734192, + "rewards/verify_math_reward/std": 0.4999987483024597, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0027140029123984277, + "clip_ratio/high_mean": 0.0009580726582498755, + "clip_ratio/low_mean": 0.0008908036925276974, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018488763671484776, + "epoch": 0.9472149314669, + "grad_norm": 0.19398583471775055, + "learning_rate": 1e-06, + "loss": -0.0348, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0027486858452903107, + "clip_ratio/high_mean": 0.0010101074276462896, + "clip_ratio/low_mean": 0.0010566887012828374, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020667961143772118, + "epoch": 0.9495479731700204, + "grad_norm": 0.17073506116867065, + "learning_rate": 1e-06, + "loss": -0.035, + "step": 407 + }, + { + "clip_ratio/high_max": 0.002703230216866359, + "clip_ratio/high_mean": 0.0008891025681805331, + "clip_ratio/low_mean": 0.0011867632620123914, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002075865850201808, + "epoch": 0.9518810148731408, + "grad_norm": 0.17584951221942902, + "learning_rate": 1e-06, + "loss": -0.035, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0017499868990853429, + "clip_ratio/high_mean": 0.0006895441420056159, + "clip_ratio/low_mean": 0.0005366919976950157, + "clip_ratio/low_min": 2.0552450223476626e-05, + "clip_ratio/region_mean": 0.001226236159709515, + "completions/clipped_ratio": 0.1127232142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3481.0, + "completions/mean_length": 1060.05810546875, + "completions/mean_terminated_length": 674.3597412109375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.9542140565762613, + "grad_norm": 0.21991465985774994, + "learning_rate": 1e-06, + "loss": -0.0314, + "num_tokens": 62523628.0, + "reward": 0.5178571939468384, + "reward_std": 0.1661185324192047, + "rewards/verify_math_reward/mean": 0.5178571343421936, + "rewards/verify_math_reward/std": 0.4999600946903229, + "step": 409 + }, + { + "clip_ratio/high_max": 0.002041467319941148, + "clip_ratio/high_mean": 0.0008404169966524933, + "clip_ratio/low_mean": 0.0008108198853733484, + "clip_ratio/low_min": 9.36329615797149e-06, + "clip_ratio/region_mean": 0.0016512368674739264, + "epoch": 0.9565470982793818, + "grad_norm": 0.2058495134115219, + "learning_rate": 1e-06, + "loss": -0.0316, + "step": 410 + }, + { + "clip_ratio/high_max": 0.002102226069837343, + "clip_ratio/high_mean": 0.000885735877091065, + "clip_ratio/low_mean": 0.0010361661043134518, + "clip_ratio/low_min": 3.745318463188596e-05, + "clip_ratio/region_mean": 0.0019219019668526016, + "epoch": 0.9588801399825022, + "grad_norm": 0.16779199242591858, + "learning_rate": 1e-06, + "loss": -0.0318, + "step": 411 + }, + { + "clip_ratio/high_max": 0.002049068563792389, + "clip_ratio/high_mean": 0.0008057550767262001, + "clip_ratio/low_mean": 0.0011653085093712434, + "clip_ratio/low_min": 1.872659231594298e-05, + "clip_ratio/region_mean": 0.00197106359701138, + "epoch": 0.9612131816856226, + "grad_norm": 0.20040445029735565, + "learning_rate": 1e-06, + "loss": -0.0319, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0021113881448400207, + "clip_ratio/high_mean": 0.0008135189400491072, + "clip_ratio/low_mean": 0.0006807227355238865, + "clip_ratio/low_min": 5.56997511012014e-05, + "clip_ratio/region_mean": 0.0014942416819394566, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3887.0, + "completions/mean_length": 1005.1975708007812, + "completions/mean_terminated_length": 625.6253051757812, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.9635462233887431, + "grad_norm": 0.2439853399991989, + "learning_rate": 1e-06, + "loss": -0.0488, + "num_tokens": 63115821.0, + "reward": 0.5546875, + "reward_std": 0.171687051653862, + "rewards/verify_math_reward/mean": 0.5546875, + "rewards/verify_math_reward/std": 0.4972778558731079, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0029173197981435806, + "clip_ratio/high_mean": 0.001055622829881031, + "clip_ratio/low_mean": 0.0009044086964422604, + "clip_ratio/low_min": 3.234574705857085e-05, + "clip_ratio/region_mean": 0.001960031564522069, + "epoch": 0.9658792650918635, + "grad_norm": 0.212842658162117, + "learning_rate": 1e-06, + "loss": -0.0491, + "step": 414 + }, + { + "clip_ratio/high_max": 0.002813082253851462, + "clip_ratio/high_mean": 0.0010721070702857105, + "clip_ratio/low_mean": 0.0011335684757796116, + "clip_ratio/low_min": 0.00010438599565532058, + "clip_ratio/region_mean": 0.0022056755478843115, + "epoch": 0.9682123067949839, + "grad_norm": 0.18718186020851135, + "learning_rate": 1e-06, + "loss": -0.0492, + "step": 415 + }, + { + "clip_ratio/high_max": 0.002706474653678015, + "clip_ratio/high_mean": 0.0010399333223176654, + "clip_ratio/low_mean": 0.0013874257856514305, + "clip_ratio/low_min": 0.00010292992737959139, + "clip_ratio/region_mean": 0.0024273590970551595, + "epoch": 0.9705453484981044, + "grad_norm": 0.16945691406726837, + "learning_rate": 1e-06, + "loss": -0.0493, + "step": 416 + }, + { + "clip_ratio/high_max": 0.002532029800931923, + "clip_ratio/high_mean": 0.00104540411484777, + "clip_ratio/low_mean": 0.0006206272819326841, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001666031384957023, + "completions/clipped_ratio": 0.0948660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3944.0, + "completions/mean_length": 929.4297485351562, + "completions/mean_terminated_length": 597.5449829101562, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.9728783902012248, + "grad_norm": 0.2921268939971924, + "learning_rate": 1e-06, + "loss": -0.027, + "num_tokens": 63688326.0, + "reward": 0.5401785969734192, + "reward_std": 0.19787125289440155, + "rewards/verify_math_reward/mean": 0.5401785969734192, + "rewards/verify_math_reward/std": 0.49866142868995667, + "step": 417 + }, + { + "clip_ratio/high_max": 0.002880187767004827, + "clip_ratio/high_mean": 0.0012010327791358577, + "clip_ratio/low_mean": 0.0008800021423667204, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020810349014936946, + "epoch": 0.9752114319043453, + "grad_norm": 0.22427064180374146, + "learning_rate": 1e-06, + "loss": -0.0273, + "step": 418 + }, + { + "clip_ratio/high_max": 0.002787976758554578, + "clip_ratio/high_mean": 0.0011979122064076364, + "clip_ratio/low_mean": 0.0011710269518516725, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002368939225561917, + "epoch": 0.9775444736074658, + "grad_norm": 0.22764481604099274, + "learning_rate": 1e-06, + "loss": -0.0275, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0028928016909048893, + "clip_ratio/high_mean": 0.0012647529365494847, + "clip_ratio/low_mean": 0.0013513580543076387, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026161109999520704, + "epoch": 0.9798775153105862, + "grad_norm": 0.19134294986724854, + "learning_rate": 1e-06, + "loss": -0.0276, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0018266616134496871, + "clip_ratio/high_mean": 0.0007190967153292149, + "clip_ratio/low_mean": 0.0005109191979499883, + "clip_ratio/low_min": 1.6720170606276952e-05, + "clip_ratio/region_mean": 0.001230015892360825, + "completions/clipped_ratio": 0.1037946428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3565.0, + "completions/mean_length": 982.4832763671875, + "completions/mean_terminated_length": 621.88916015625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.9822105570137066, + "grad_norm": 0.20579728484153748, + "learning_rate": 1e-06, + "loss": -0.0488, + "num_tokens": 64269895.0, + "reward": 0.574776828289032, + "reward_std": 0.14571575820446014, + "rewards/verify_math_reward/mean": 0.5747767686843872, + "rewards/verify_math_reward/std": 0.49465295672416687, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0022223303021746688, + "clip_ratio/high_mean": 0.0008412895622313954, + "clip_ratio/low_mean": 0.0006025078419042984, + "clip_ratio/low_min": 1.1704119970090687e-05, + "clip_ratio/region_mean": 0.0014437974132306408, + "epoch": 0.9845435987168271, + "grad_norm": 0.1702871322631836, + "learning_rate": 1e-06, + "loss": -0.0489, + "step": 422 + }, + { + "clip_ratio/high_max": 0.002629214672197122, + "clip_ratio/high_mean": 0.0008907833580451552, + "clip_ratio/low_mean": 0.000756228415411897, + "clip_ratio/low_min": 4.724111931864172e-05, + "clip_ratio/region_mean": 0.0016470117516291793, + "epoch": 0.9868766404199475, + "grad_norm": 0.18380087614059448, + "learning_rate": 1e-06, + "loss": -0.0491, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0024758439176366664, + "clip_ratio/high_mean": 0.0008649492338008713, + "clip_ratio/low_mean": 0.0009179535700241104, + "clip_ratio/low_min": 7.022471982054412e-05, + "clip_ratio/region_mean": 0.0017829028074629605, + "epoch": 0.9892096821230679, + "grad_norm": 0.1679142862558365, + "learning_rate": 1e-06, + "loss": -0.0491, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0023570269368065055, + "clip_ratio/high_mean": 0.0010959362571156817, + "clip_ratio/low_mean": 0.000697734321875032, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017936705917236395, + "completions/clipped_ratio": 0.1116071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3606.0, + "completions/mean_length": 995.2154541015625, + "completions/mean_terminated_length": 605.6696166992188, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.9915427238261884, + "grad_norm": 0.310123085975647, + "learning_rate": 1e-06, + "loss": -0.0432, + "num_tokens": 64848512.0, + "reward": 0.5412946939468384, + "reward_std": 0.20440296828746796, + "rewards/verify_math_reward/mean": 0.5412946343421936, + "rewards/verify_math_reward/std": 0.49857014417648315, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0029505515594792087, + "clip_ratio/high_mean": 0.0013292095172801055, + "clip_ratio/low_mean": 0.001031558427712298, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023607679031556472, + "epoch": 0.9938757655293088, + "grad_norm": 0.25105151534080505, + "learning_rate": 1e-06, + "loss": -0.0435, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0030129859660519287, + "clip_ratio/high_mean": 0.0014149835515127052, + "clip_ratio/low_mean": 0.001323643886280479, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002738627459621057, + "epoch": 0.9962088072324293, + "grad_norm": 0.2301369458436966, + "learning_rate": 1e-06, + "loss": -0.0438, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0031693995551904663, + "clip_ratio/high_mean": 0.001362594950478524, + "clip_ratio/low_mean": 0.0015099316769919824, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0028725266965921037, + "epoch": 0.9985418489355498, + "grad_norm": 0.21407221257686615, + "learning_rate": 1e-06, + "loss": -0.0439, + "step": 428 + }, + { + "clip_ratio/high_max": 0.00188614493526984, + "clip_ratio/high_mean": 0.000804677469204762, + "clip_ratio/low_mean": 0.0006909416897542542, + "clip_ratio/low_min": 1.371215421386296e-05, + "clip_ratio/region_mean": 0.0014956191917008255, + "completions/clipped_ratio": 0.0669642857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3768.0, + "completions/mean_length": 850.3594360351562, + "completions/mean_terminated_length": 617.4186401367188, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 1.0023330417031204, + "grad_norm": 0.2333025187253952, + "learning_rate": 1e-06, + "loss": -0.0315, + "num_tokens": 65455098.0, + "reward": 0.6350446939468384, + "reward_std": 0.17867198586463928, + "rewards/verify_math_reward/mean": 0.6350446343421936, + "rewards/verify_math_reward/std": 0.481686532497406, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0023995939482119866, + "clip_ratio/high_mean": 0.0010416395452921279, + "clip_ratio/low_mean": 0.0009332010322395945, + "clip_ratio/low_min": 4.335260018706322e-05, + "clip_ratio/region_mean": 0.001974840590264648, + "epoch": 1.0046660834062409, + "grad_norm": 0.2440246194601059, + "learning_rate": 1e-06, + "loss": -0.0317, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0022679026078549214, + "clip_ratio/high_mean": 0.0009606390412955079, + "clip_ratio/low_mean": 0.0011004831012542127, + "clip_ratio/low_min": 1.7936576114152558e-05, + "clip_ratio/region_mean": 0.0020611221698345616, + "epoch": 1.0069991251093613, + "grad_norm": 0.17937007546424866, + "learning_rate": 1e-06, + "loss": -0.0319, + "step": 431 + }, + { + "clip_ratio/high_max": 0.002265503804665059, + "clip_ratio/high_mean": 0.0009249858303519432, + "clip_ratio/low_mean": 0.0011825941983261146, + "clip_ratio/low_min": 8.968288057076279e-06, + "clip_ratio/region_mean": 0.0021075799959362485, + "epoch": 1.0093321668124817, + "grad_norm": 0.18658733367919922, + "learning_rate": 1e-06, + "loss": -0.032, + "step": 432 + }, + { + "clip_ratio/high_max": 0.002188250800827518, + "clip_ratio/high_mean": 0.0007850914971641032, + "clip_ratio/low_mean": 0.0006092657213230268, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013943572448624764, + "completions/clipped_ratio": 0.0959821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3200.0, + "completions/mean_length": 925.9609985351562, + "completions/mean_terminated_length": 589.388916015625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 1.0116652085156022, + "grad_norm": 0.24572688341140747, + "learning_rate": 1e-06, + "loss": -0.0442, + "num_tokens": 66028951.0, + "reward": 0.637276828289032, + "reward_std": 0.15319034457206726, + "rewards/verify_math_reward/mean": 0.6372767686843872, + "rewards/verify_math_reward/std": 0.481054425239563, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0027622638517641462, + "clip_ratio/high_mean": 0.0010239535840810277, + "clip_ratio/low_mean": 0.0007826659921192913, + "clip_ratio/low_min": 2.5562372684362344e-05, + "clip_ratio/region_mean": 0.0018066195843857713, + "epoch": 1.0139982502187226, + "grad_norm": 0.22620150446891785, + "learning_rate": 1e-06, + "loss": -0.0445, + "step": 434 + }, + { + "clip_ratio/high_max": 0.002733622441155603, + "clip_ratio/high_mean": 0.0009858197190624196, + "clip_ratio/low_mean": 0.0010387497350166086, + "clip_ratio/low_min": 1.6297262845910154e-05, + "clip_ratio/region_mean": 0.0020245694468030706, + "epoch": 1.016331291921843, + "grad_norm": 0.18662086129188538, + "learning_rate": 1e-06, + "loss": -0.0447, + "step": 435 + }, + { + "clip_ratio/high_max": 0.002920517210441176, + "clip_ratio/high_mean": 0.001070248461473966, + "clip_ratio/low_mean": 0.001208086943734088, + "clip_ratio/low_min": 3.259452569182031e-05, + "clip_ratio/region_mean": 0.00227833535609534, + "epoch": 1.0186643336249634, + "grad_norm": 0.19832929968833923, + "learning_rate": 1e-06, + "loss": -0.0448, + "step": 436 + }, + { + "clip_ratio/high_max": 0.00233198835485382, + "clip_ratio/high_mean": 0.000919369622351951, + "clip_ratio/low_mean": 0.0005849066842529282, + "clip_ratio/low_min": 2.708247848204337e-05, + "clip_ratio/region_mean": 0.0015042763370729517, + "completions/clipped_ratio": 0.1339285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3678.0, + "completions/mean_length": 1107.8035888671875, + "completions/mean_terminated_length": 645.7113037109375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 1.020997375328084, + "grad_norm": 0.25459355115890503, + "learning_rate": 1e-06, + "loss": -0.0649, + "num_tokens": 66623023.0, + "reward": 0.5334821939468384, + "reward_std": 0.1828376203775406, + "rewards/verify_math_reward/mean": 0.5334821343421936, + "rewards/verify_math_reward/std": 0.49915632605552673, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0032297305806423537, + "clip_ratio/high_mean": 0.0012008415415039053, + "clip_ratio/low_mean": 0.0008006979960555327, + "clip_ratio/low_min": 2.0398172637214884e-05, + "clip_ratio/region_mean": 0.00200153960759053, + "epoch": 1.0233304170312045, + "grad_norm": 0.22269797325134277, + "learning_rate": 1e-06, + "loss": -0.0652, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0029371954951784573, + "clip_ratio/high_mean": 0.0011158537872688612, + "clip_ratio/low_mean": 0.0009924400619638618, + "clip_ratio/low_min": 5.387098099163268e-05, + "clip_ratio/region_mean": 0.0021082937964820303, + "epoch": 1.025663458734325, + "grad_norm": 0.17895840108394623, + "learning_rate": 1e-06, + "loss": -0.0654, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0031041721085784957, + "clip_ratio/high_mean": 0.0011557863945199642, + "clip_ratio/low_mean": 0.0011620121422311058, + "clip_ratio/low_min": 6.109514288255014e-05, + "clip_ratio/region_mean": 0.0023177985203801654, + "epoch": 1.0279965004374454, + "grad_norm": 0.1888708770275116, + "learning_rate": 1e-06, + "loss": -0.0655, + "step": 440 + }, + { + "clip_ratio/high_max": 0.002363800063903909, + "clip_ratio/high_mean": 0.000765284659792087, + "clip_ratio/low_mean": 0.0006764551799278706, + "clip_ratio/low_min": 2.4149921955540776e-05, + "clip_ratio/region_mean": 0.00144173986336682, + "completions/clipped_ratio": 0.1205357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3328.0, + "completions/mean_length": 1028.1195068359375, + "completions/mean_terminated_length": 607.6484375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 1.0303295421405658, + "grad_norm": 0.29041218757629395, + "learning_rate": 1e-06, + "loss": -0.0467, + "num_tokens": 67206226.0, + "reward": 0.5055803656578064, + "reward_std": 0.1731078028678894, + "rewards/verify_math_reward/mean": 0.5055803656578064, + "rewards/verify_math_reward/std": 0.5002480745315552, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0028002179751638323, + "clip_ratio/high_mean": 0.0009567866181896534, + "clip_ratio/low_mean": 0.0009720387351990212, + "clip_ratio/low_min": 1.2074960977770388e-05, + "clip_ratio/region_mean": 0.0019288253606646322, + "epoch": 1.0326625838436863, + "grad_norm": 0.21335922181606293, + "learning_rate": 1e-06, + "loss": -0.047, + "step": 442 + }, + { + "clip_ratio/high_max": 0.002605081615911331, + "clip_ratio/high_mean": 0.0009598919823474716, + "clip_ratio/low_mean": 0.001113256159442244, + "clip_ratio/low_min": 1.4397603990801144e-05, + "clip_ratio/region_mean": 0.002073148156341631, + "epoch": 1.0349956255468067, + "grad_norm": 0.1877458244562149, + "learning_rate": 1e-06, + "loss": -0.0472, + "step": 443 + }, + { + "clip_ratio/high_max": 0.002722404162341263, + "clip_ratio/high_mean": 0.0009160343215626199, + "clip_ratio/low_mean": 0.001470162289479049, + "clip_ratio/low_min": 5.759041596320458e-05, + "clip_ratio/region_mean": 0.0023861965819378383, + "epoch": 1.0373286672499271, + "grad_norm": 0.17515434324741364, + "learning_rate": 1e-06, + "loss": -0.0473, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0018894981767516583, + "clip_ratio/high_mean": 0.0007696930078964215, + "clip_ratio/low_mean": 0.0004294440727790061, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011991370738542173, + "completions/clipped_ratio": 0.1261160714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3809.0, + "completions/mean_length": 1033.2054443359375, + "completions/mean_terminated_length": 591.1928100585938, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 1.0396617089530475, + "grad_norm": 0.25584280490875244, + "learning_rate": 1e-06, + "loss": -0.0647, + "num_tokens": 67763706.0, + "reward": 0.6205357313156128, + "reward_std": 0.16398167610168457, + "rewards/verify_math_reward/mean": 0.6205357313156128, + "rewards/verify_math_reward/std": 0.4855247139930725, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0025248981764889322, + "clip_ratio/high_mean": 0.0010226915183011442, + "clip_ratio/low_mean": 0.0007047745557429153, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017274660640396178, + "epoch": 1.041994750656168, + "grad_norm": 0.20202475786209106, + "learning_rate": 1e-06, + "loss": -0.0649, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0024538811994716525, + "clip_ratio/high_mean": 0.0010547231504460797, + "clip_ratio/low_mean": 0.0008309453060064698, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018856684473576024, + "epoch": 1.0443277923592884, + "grad_norm": 0.17896480858325958, + "learning_rate": 1e-06, + "loss": -0.0651, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0026427094198879786, + "clip_ratio/high_mean": 0.0010373942131991498, + "clip_ratio/low_mean": 0.0009970391201932216, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020344333242974244, + "epoch": 1.0466608340624088, + "grad_norm": 0.17334654927253723, + "learning_rate": 1e-06, + "loss": -0.0652, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0017308068145212019, + "clip_ratio/high_mean": 0.0006800707551519736, + "clip_ratio/low_mean": 0.0004515814352998859, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00113165218863287, + "completions/clipped_ratio": 0.1149553571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3804.0, + "completions/mean_length": 1005.1350708007812, + "completions/mean_terminated_length": 603.6734008789062, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 1.0489938757655293, + "grad_norm": 0.2161651849746704, + "learning_rate": 1e-06, + "loss": -0.0636, + "num_tokens": 68333051.0, + "reward": 0.5703125, + "reward_std": 0.13084182143211365, + "rewards/verify_math_reward/mean": 0.5703125, + "rewards/verify_math_reward/std": 0.49530795216560364, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0022399268855224364, + "clip_ratio/high_mean": 0.0007800824223522795, + "clip_ratio/low_mean": 0.0006723454164330178, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001452427837648429, + "epoch": 1.0513269174686497, + "grad_norm": 0.16871459782123566, + "learning_rate": 1e-06, + "loss": -0.0639, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0026769223622977734, + "clip_ratio/high_mean": 0.0008887108624549, + "clip_ratio/low_mean": 0.000769434676385572, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016581455092818942, + "epoch": 1.0536599591717701, + "grad_norm": 0.17597126960754395, + "learning_rate": 1e-06, + "loss": -0.0639, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0022117251392046455, + "clip_ratio/high_mean": 0.0008225946094171377, + "clip_ratio/low_mean": 0.0008988589370346745, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017214535691891797, + "epoch": 1.0559930008748906, + "grad_norm": 0.16394490003585815, + "learning_rate": 1e-06, + "loss": -0.064, + "step": 452 + }, + { + "clip_ratio/high_max": 0.001844191509007942, + "clip_ratio/high_mean": 0.0006051035788914305, + "clip_ratio/low_mean": 0.0005907940394536126, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011958976356254425, + "completions/clipped_ratio": 0.1305803571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2765.0, + "completions/mean_length": 1068.271240234375, + "completions/mean_terminated_length": 613.5289306640625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 1.058326042578011, + "grad_norm": 0.24267460405826569, + "learning_rate": 1e-06, + "loss": -0.055, + "num_tokens": 68922438.0, + "reward": 0.5189732313156128, + "reward_std": 0.16856057941913605, + "rewards/verify_math_reward/mean": 0.5189732313156128, + "rewards/verify_math_reward/std": 0.49991893768310547, + "step": 453 + }, + { + "clip_ratio/high_max": 0.002777697729470674, + "clip_ratio/high_mean": 0.0010148937635676702, + "clip_ratio/low_mean": 0.0007792864762450336, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001794180272554513, + "epoch": 1.0606590842811314, + "grad_norm": 0.2198466807603836, + "learning_rate": 1e-06, + "loss": -0.0553, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0028655557398451492, + "clip_ratio/high_mean": 0.001007710066915024, + "clip_ratio/low_mean": 0.0010238405156997032, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020315505607868545, + "epoch": 1.0629921259842519, + "grad_norm": 0.18871402740478516, + "learning_rate": 1e-06, + "loss": -0.0555, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0023621154177817516, + "clip_ratio/high_mean": 0.0009254372853320092, + "clip_ratio/low_mean": 0.0011858456418849528, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021112828471814282, + "epoch": 1.0653251676873725, + "grad_norm": 0.1881953328847885, + "learning_rate": 1e-06, + "loss": -0.0556, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0020752472410094924, + "clip_ratio/high_mean": 0.0008432522045040969, + "clip_ratio/low_mean": 0.0003859125345115899, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012291647362872027, + "completions/clipped_ratio": 0.1071428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2376.0, + "completions/mean_length": 985.64404296875, + "completions/mean_terminated_length": 612.4012451171875, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 1.067658209390493, + "grad_norm": 0.2506106197834015, + "learning_rate": 1e-06, + "loss": -0.0379, + "num_tokens": 69500271.0, + "reward": 0.5558035969734192, + "reward_std": 0.15361177921295166, + "rewards/verify_math_reward/mean": 0.5558035969734192, + "rewards/verify_math_reward/std": 0.49715372920036316, + "step": 457 + }, + { + "clip_ratio/high_max": 0.002430212880426552, + "clip_ratio/high_mean": 0.0009447149423067458, + "clip_ratio/low_mean": 0.0006093982910897466, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001554113237943966, + "epoch": 1.0699912510936134, + "grad_norm": 0.19296887516975403, + "learning_rate": 1e-06, + "loss": -0.0382, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0026876567644649185, + "clip_ratio/high_mean": 0.0010661713204171974, + "clip_ratio/low_mean": 0.0007335922709899023, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017997636023210362, + "epoch": 1.0723242927967338, + "grad_norm": 0.21186336874961853, + "learning_rate": 1e-06, + "loss": -0.0383, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0026717531436588615, + "clip_ratio/high_mean": 0.0010539452923694625, + "clip_ratio/low_mean": 0.0009490556058153743, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002003000867262017, + "epoch": 1.0746573344998542, + "grad_norm": 0.1976313292980194, + "learning_rate": 1e-06, + "loss": -0.0384, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0017274421516049188, + "clip_ratio/high_mean": 0.0006933596951057552, + "clip_ratio/low_mean": 0.0006798601580157992, + "clip_ratio/low_min": 4.327072747400962e-05, + "clip_ratio/region_mean": 0.0013732198531215545, + "completions/clipped_ratio": 0.0904017857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3699.0, + "completions/mean_length": 932.0457763671875, + "completions/mean_terminated_length": 617.5914306640625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 1.0769903762029747, + "grad_norm": 0.270966500043869, + "learning_rate": 1e-06, + "loss": -0.0645, + "num_tokens": 70091448.0, + "reward": 0.6383928656578064, + "reward_std": 0.15631499886512756, + "rewards/verify_math_reward/mean": 0.6383928656578064, + "rewards/verify_math_reward/std": 0.4807341694831848, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0022845575695100706, + "clip_ratio/high_mean": 0.0008984553369373316, + "clip_ratio/low_mean": 0.0008425280557275983, + "clip_ratio/low_min": 2.961796326417243e-05, + "clip_ratio/region_mean": 0.0017409833853889722, + "epoch": 1.079323417906095, + "grad_norm": 0.1994178146123886, + "learning_rate": 1e-06, + "loss": -0.0647, + "step": 462 + }, + { + "clip_ratio/high_max": 0.002211950828495901, + "clip_ratio/high_mean": 0.0008875480789356516, + "clip_ratio/low_mean": 0.0010095132511196425, + "clip_ratio/low_min": 5.4537522373721004e-05, + "clip_ratio/region_mean": 0.0018970613418787252, + "epoch": 1.0816564596092155, + "grad_norm": 0.18105797469615936, + "learning_rate": 1e-06, + "loss": -0.0648, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0021518303838092834, + "clip_ratio/high_mean": 0.0008788373797870008, + "clip_ratio/low_mean": 0.0012122353618906345, + "clip_ratio/low_min": 5.429171142168343e-05, + "clip_ratio/region_mean": 0.002091072739858646, + "epoch": 1.083989501312336, + "grad_norm": 0.16402515769004822, + "learning_rate": 1e-06, + "loss": -0.065, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0024228825350292027, + "clip_ratio/high_mean": 0.0010354480000387412, + "clip_ratio/low_mean": 0.0005089695328024391, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015444175223819911, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2188.0, + "completions/mean_length": 959.919677734375, + "completions/mean_terminated_length": 605.4061889648438, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 1.0863225430154564, + "grad_norm": 0.28489306569099426, + "learning_rate": 1e-06, + "loss": -0.0359, + "num_tokens": 70663920.0, + "reward": 0.5948660969734192, + "reward_std": 0.19050613045692444, + "rewards/verify_math_reward/mean": 0.5948660969734192, + "rewards/verify_math_reward/std": 0.49119213223457336, + "step": 465 + }, + { + "clip_ratio/high_max": 0.002697384203202091, + "clip_ratio/high_mean": 0.0011934083595406264, + "clip_ratio/low_mean": 0.0007661204526812071, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019595287885749713, + "epoch": 1.0886555847185768, + "grad_norm": 0.2708839774131775, + "learning_rate": 1e-06, + "loss": -0.0362, + "step": 466 + }, + { + "clip_ratio/high_max": 0.00271049264119938, + "clip_ratio/high_mean": 0.00129466382713872, + "clip_ratio/low_mean": 0.0009820834166021086, + "clip_ratio/low_min": 2.2329402781906538e-05, + "clip_ratio/region_mean": 0.002276747276482638, + "epoch": 1.0909886264216972, + "grad_norm": 0.20637908577919006, + "learning_rate": 1e-06, + "loss": -0.0364, + "step": 467 + }, + { + "clip_ratio/high_max": 0.002703095513425069, + "clip_ratio/high_mean": 0.0012898474706162233, + "clip_ratio/low_mean": 0.0012216093127790373, + "clip_ratio/low_min": 2.2329402781906538e-05, + "clip_ratio/region_mean": 0.002511456848878879, + "epoch": 1.0933216681248177, + "grad_norm": 0.2538661062717438, + "learning_rate": 1e-06, + "loss": -0.0365, + "step": 468 + }, + { + "clip_ratio/high_max": 0.002326559078937862, + "clip_ratio/high_mean": 0.0008292814873129828, + "clip_ratio/low_mean": 0.0004958757281201542, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013251571945147589, + "completions/clipped_ratio": 0.1138392857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3780.0, + "completions/mean_length": 1031.818115234375, + "completions/mean_terminated_length": 638.1826171875, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 1.0956547098279381, + "grad_norm": 0.23984180390834808, + "learning_rate": 1e-06, + "loss": -0.0455, + "num_tokens": 71244989.0, + "reward": 0.640625, + "reward_std": 0.15570516884326935, + "rewards/verify_math_reward/mean": 0.640625, + "rewards/verify_math_reward/std": 0.48008525371551514, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0030103095414233394, + "clip_ratio/high_mean": 0.0009939269912138116, + "clip_ratio/low_mean": 0.0006345219062495744, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016284488810924813, + "epoch": 1.0979877515310585, + "grad_norm": 0.20683689415454865, + "learning_rate": 1e-06, + "loss": -0.0456, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0025437069343752228, + "clip_ratio/high_mean": 0.0008786136968410574, + "clip_ratio/low_mean": 0.0008079185627138941, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016865322904777713, + "epoch": 1.100320793234179, + "grad_norm": 0.18974076211452484, + "learning_rate": 1e-06, + "loss": -0.0458, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0029568531972472556, + "clip_ratio/high_mean": 0.0009815776320465375, + "clip_ratio/low_mean": 0.000939180177738308, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019207578152418137, + "epoch": 1.1026538349372994, + "grad_norm": 0.18512237071990967, + "learning_rate": 1e-06, + "loss": -0.0458, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0018132032637367956, + "clip_ratio/high_mean": 0.0007385272347164573, + "clip_ratio/low_mean": 0.000592039981711423, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013305672109709121, + "completions/clipped_ratio": 0.0926339285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3465.0, + "completions/mean_length": 925.0803833007812, + "completions/mean_terminated_length": 601.35791015625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 1.10498687664042, + "grad_norm": 0.2671792209148407, + "learning_rate": 1e-06, + "loss": -0.047, + "num_tokens": 71807517.0, + "reward": 0.6261160969734192, + "reward_std": 0.15503577888011932, + "rewards/verify_math_reward/mean": 0.6261160969734192, + "rewards/verify_math_reward/std": 0.48410359025001526, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0022711845631420147, + "clip_ratio/high_mean": 0.000970492787018884, + "clip_ratio/low_mean": 0.0007535526510764612, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017240454253624193, + "epoch": 1.1073199183435405, + "grad_norm": 0.2036406397819519, + "learning_rate": 1e-06, + "loss": -0.0471, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0024492949814884923, + "clip_ratio/high_mean": 0.0009340437973150983, + "clip_ratio/low_mean": 0.0009111156687140465, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018451594587531872, + "epoch": 1.109652960046661, + "grad_norm": 0.1986713707447052, + "learning_rate": 1e-06, + "loss": -0.0473, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0025339678759337403, + "clip_ratio/high_mean": 0.000984592094027903, + "clip_ratio/low_mean": 0.0010988305184582714, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020834225942962803, + "epoch": 1.1119860017497813, + "grad_norm": 0.17704980075359344, + "learning_rate": 1e-06, + "loss": -0.0474, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0026995309817721136, + "clip_ratio/high_mean": 0.0010780071061162744, + "clip_ratio/low_mean": 0.0005695656591342413, + "clip_ratio/low_min": 9.328357919002883e-06, + "clip_ratio/region_mean": 0.0016475728079967666, + "completions/clipped_ratio": 0.1160714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 1060.641845703125, + "completions/mean_terminated_length": 662.059326171875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 1.1143190434529018, + "grad_norm": 0.26159003376960754, + "learning_rate": 1e-06, + "loss": -0.0551, + "num_tokens": 72428508.0, + "reward": 0.5703125, + "reward_std": 0.2054987996816635, + "rewards/verify_math_reward/mean": 0.5703125, + "rewards/verify_math_reward/std": 0.49530795216560364, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0029697629215661436, + "clip_ratio/high_mean": 0.0012597079148690682, + "clip_ratio/low_mean": 0.0008002120393939549, + "clip_ratio/low_min": 2.153192508558277e-05, + "clip_ratio/region_mean": 0.0020599199342541397, + "epoch": 1.1166520851560222, + "grad_norm": 0.21325941383838654, + "learning_rate": 1e-06, + "loss": -0.0554, + "step": 478 + }, + { + "clip_ratio/high_max": 0.002954839372250717, + "clip_ratio/high_mean": 0.0012681778644036967, + "clip_ratio/low_mean": 0.0009019180924951797, + "clip_ratio/low_min": 1.0257672329316847e-05, + "clip_ratio/region_mean": 0.0021700959332520142, + "epoch": 1.1189851268591426, + "grad_norm": 0.2004043459892273, + "learning_rate": 1e-06, + "loss": -0.0555, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0030141370443743654, + "clip_ratio/high_mean": 0.0012908005010103807, + "clip_ratio/low_mean": 0.0012389729054120835, + "clip_ratio/low_min": 3.0773018806939945e-05, + "clip_ratio/region_mean": 0.002529773410060443, + "epoch": 1.121318168562263, + "grad_norm": 0.18501633405685425, + "learning_rate": 1e-06, + "loss": -0.0557, + "step": 480 + }, + { + "clip_ratio/high_max": 0.002389666642557131, + "clip_ratio/high_mean": 0.0007511532421631273, + "clip_ratio/low_mean": 0.0006754465671292564, + "clip_ratio/low_min": 1.4688601368106902e-05, + "clip_ratio/region_mean": 0.0014265998215705622, + "completions/clipped_ratio": 0.1339285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3584.0, + "completions/mean_length": 1110.560302734375, + "completions/mean_terminated_length": 648.894287109375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 1.1236512102653835, + "grad_norm": 0.27346184849739075, + "learning_rate": 1e-06, + "loss": -0.0267, + "num_tokens": 73029250.0, + "reward": 0.5212053656578064, + "reward_std": 0.15770326554775238, + "rewards/verify_math_reward/mean": 0.5212053656578064, + "rewards/verify_math_reward/std": 0.49982911348342896, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0030613427225034684, + "clip_ratio/high_mean": 0.0010165627209062222, + "clip_ratio/low_mean": 0.0008935898658819497, + "clip_ratio/low_min": 5.8754405472427607e-05, + "clip_ratio/region_mean": 0.001910152604978066, + "epoch": 1.125984251968504, + "grad_norm": 0.2034171223640442, + "learning_rate": 1e-06, + "loss": -0.0271, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0025401779857929796, + "clip_ratio/high_mean": 0.0009724038245622069, + "clip_ratio/low_mean": 0.0010596897463983623, + "clip_ratio/low_min": 2.6282590624759905e-05, + "clip_ratio/region_mean": 0.0020320935982454102, + "epoch": 1.1283172936716244, + "grad_norm": 0.1914975643157959, + "learning_rate": 1e-06, + "loss": -0.0272, + "step": 483 + }, + { + "clip_ratio/high_max": 0.003259218472521752, + "clip_ratio/high_mean": 0.0010540255243540742, + "clip_ratio/low_mean": 0.001212483823110233, + "clip_ratio/low_min": 4.4065804104320705e-05, + "clip_ratio/region_mean": 0.0022665093711111695, + "epoch": 1.1306503353747448, + "grad_norm": 0.22389191389083862, + "learning_rate": 1e-06, + "loss": -0.0273, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0021428443360491656, + "clip_ratio/high_mean": 0.0008868116710800678, + "clip_ratio/low_mean": 0.0006069602713978384, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014937718988221604, + "completions/clipped_ratio": 0.1037946428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2847.0, + "completions/mean_length": 958.5491333007812, + "completions/mean_terminated_length": 595.1830444335938, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 1.1329833770778652, + "grad_norm": 0.2494707852602005, + "learning_rate": 1e-06, + "loss": -0.0271, + "num_tokens": 73585750.0, + "reward": 0.640625, + "reward_std": 0.16871324181556702, + "rewards/verify_math_reward/mean": 0.640625, + "rewards/verify_math_reward/std": 0.48008525371551514, + "step": 485 + }, + { + "clip_ratio/high_max": 0.002578438194177579, + "clip_ratio/high_mean": 0.0010743611092038918, + "clip_ratio/low_mean": 0.0008911755230656127, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019655366268125363, + "epoch": 1.1353164187809857, + "grad_norm": 0.2744096517562866, + "learning_rate": 1e-06, + "loss": -0.0273, + "step": 486 + }, + { + "clip_ratio/high_max": 0.003209933507605456, + "clip_ratio/high_mean": 0.001222103772306582, + "clip_ratio/low_mean": 0.0009726033858896699, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021947071436443366, + "epoch": 1.137649460484106, + "grad_norm": 0.29313361644744873, + "learning_rate": 1e-06, + "loss": -0.0275, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0030149374433676712, + "clip_ratio/high_mean": 0.001209986636240501, + "clip_ratio/low_mean": 0.0013064717168163043, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0025164583494188264, + "epoch": 1.1399825021872265, + "grad_norm": 0.19453810155391693, + "learning_rate": 1e-06, + "loss": -0.0277, + "step": 488 + }, + { + "clip_ratio/high_max": 0.002624071013997309, + "clip_ratio/high_mean": 0.0009458125041419407, + "clip_ratio/low_mean": 0.0006403016905096592, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015861142237554304, + "completions/clipped_ratio": 0.1227678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3931.0, + "completions/mean_length": 1115.6239013671875, + "completions/mean_terminated_length": 698.5228881835938, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 1.142315543890347, + "grad_norm": 0.24084880948066711, + "learning_rate": 1e-06, + "loss": -0.0431, + "num_tokens": 74224877.0, + "reward": 0.5948660969734192, + "reward_std": 0.1831425130367279, + "rewards/verify_math_reward/mean": 0.5948660969734192, + "rewards/verify_math_reward/std": 0.49119213223457336, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0029585360607597977, + "clip_ratio/high_mean": 0.0010731726724770851, + "clip_ratio/low_mean": 0.0008471020973956911, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019202747862436809, + "epoch": 1.1446485855934676, + "grad_norm": 0.21253973245620728, + "learning_rate": 1e-06, + "loss": -0.0433, + "step": 490 + }, + { + "clip_ratio/high_max": 0.003574863643734716, + "clip_ratio/high_mean": 0.0012558270209410693, + "clip_ratio/low_mean": 0.0009229206416421221, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002178747636207845, + "epoch": 1.1469816272965878, + "grad_norm": 0.1913764625787735, + "learning_rate": 1e-06, + "loss": -0.0435, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0031777483600308187, + "clip_ratio/high_mean": 0.0012014315143460408, + "clip_ratio/low_mean": 0.001132662691816222, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023340941916103475, + "epoch": 1.1493146689997085, + "grad_norm": 0.18061964213848114, + "learning_rate": 1e-06, + "loss": -0.0435, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0020101828704355285, + "clip_ratio/high_mean": 0.0007751140274194768, + "clip_ratio/low_mean": 0.00044571058697329136, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012208246043883264, + "completions/clipped_ratio": 0.1004464285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3894.0, + "completions/mean_length": 1004.3839721679688, + "completions/mean_terminated_length": 659.166259765625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 1.151647710702829, + "grad_norm": 0.22036553919315338, + "learning_rate": 1e-06, + "loss": -0.0327, + "num_tokens": 74845677.0, + "reward": 0.6350446939468384, + "reward_std": 0.14948049187660217, + "rewards/verify_math_reward/mean": 0.6350446343421936, + "rewards/verify_math_reward/std": 0.4816865026950836, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0022590553999179974, + "clip_ratio/high_mean": 0.0009336157945654122, + "clip_ratio/low_mean": 0.0006544762891280698, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001588092025485821, + "epoch": 1.1539807524059493, + "grad_norm": 0.18672379851341248, + "learning_rate": 1e-06, + "loss": -0.0329, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0024380449067393783, + "clip_ratio/high_mean": 0.0010128747944690986, + "clip_ratio/low_mean": 0.0007495823338103946, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017624571337364614, + "epoch": 1.1563137941090698, + "grad_norm": 0.22866769134998322, + "learning_rate": 1e-06, + "loss": -0.033, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0025246197037631646, + "clip_ratio/high_mean": 0.0009747100557433441, + "clip_ratio/low_mean": 0.000957716760240146, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019324268505442888, + "epoch": 1.1586468358121902, + "grad_norm": 0.16688209772109985, + "learning_rate": 1e-06, + "loss": -0.0331, + "step": 496 + }, + { + "clip_ratio/high_max": 0.002160491898393957, + "clip_ratio/high_mean": 0.0007186983257270185, + "clip_ratio/low_mean": 0.0007738161439192481, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014925144787412137, + "completions/clipped_ratio": 0.1573660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3946.0, + "completions/mean_length": 1202.20654296875, + "completions/mean_terminated_length": 661.7761840820312, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 1.1609798775153106, + "grad_norm": 0.24846456944942474, + "learning_rate": 1e-06, + "loss": -0.0506, + "num_tokens": 75435662.0, + "reward": 0.5234375, + "reward_std": 0.13575772941112518, + "rewards/verify_math_reward/mean": 0.5234375, + "rewards/verify_math_reward/std": 0.49972933530807495, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0026769454598252196, + "clip_ratio/high_mean": 0.0008719501320229028, + "clip_ratio/low_mean": 0.00085258467879612, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001724534831737401, + "epoch": 1.163312919218431, + "grad_norm": 0.22556813061237335, + "learning_rate": 1e-06, + "loss": -0.0507, + "step": 498 + }, + { + "clip_ratio/high_max": 0.003002273697347846, + "clip_ratio/high_mean": 0.0009335558734164806, + "clip_ratio/low_mean": 0.0011308597659080988, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020644156757043675, + "epoch": 1.1656459609215515, + "grad_norm": 0.18102315068244934, + "learning_rate": 1e-06, + "loss": -0.0509, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0029013332896283828, + "clip_ratio/high_mean": 0.0009207827542923042, + "clip_ratio/low_mean": 0.0012869851234427188, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022077678513596766, + "epoch": 1.167979002624672, + "grad_norm": 0.16825012862682343, + "learning_rate": 1e-06, + "loss": -0.051, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0017726223231875338, + "clip_ratio/high_mean": 0.0006842501479695784, + "clip_ratio/low_mean": 0.00048078535655804444, + "clip_ratio/low_min": 8.508031896781176e-06, + "clip_ratio/region_mean": 0.0011650355045276228, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3630.0, + "completions/mean_length": 984.5201416015625, + "completions/mean_terminated_length": 632.7875366210938, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 1.1703120443277923, + "grad_norm": 0.24229595065116882, + "learning_rate": 1e-06, + "loss": -0.0404, + "num_tokens": 76033536.0, + "reward": 0.6104910969734192, + "reward_std": 0.15480685234069824, + "rewards/verify_math_reward/mean": 0.6104910969734192, + "rewards/verify_math_reward/std": 0.48791125416755676, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0024169529206119478, + "clip_ratio/high_mean": 0.0010063247500511352, + "clip_ratio/low_mean": 0.0006739567579643335, + "clip_ratio/low_min": 2.0223264073138125e-05, + "clip_ratio/region_mean": 0.0016802815152914263, + "epoch": 1.1726450860309128, + "grad_norm": 0.2216462790966034, + "learning_rate": 1e-06, + "loss": -0.0406, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0023699274461250752, + "clip_ratio/high_mean": 0.0009497327700955793, + "clip_ratio/low_mean": 0.0008968771453510271, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018466099500074051, + "epoch": 1.1749781277340332, + "grad_norm": 0.1699136197566986, + "learning_rate": 1e-06, + "loss": -0.0408, + "step": 503 + }, + { + "clip_ratio/high_max": 0.002410533001238946, + "clip_ratio/high_mean": 0.0009472103756706929, + "clip_ratio/low_mean": 0.0010392735275672749, + "clip_ratio/low_min": 2.691065674298443e-05, + "clip_ratio/region_mean": 0.001986483890505042, + "epoch": 1.1773111694371536, + "grad_norm": 0.18011756241321564, + "learning_rate": 1e-06, + "loss": -0.0409, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0017886991263367236, + "clip_ratio/high_mean": 0.0007002090806054184, + "clip_ratio/low_mean": 0.0007003072787483688, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001400516335706925, + "completions/clipped_ratio": 0.1261160714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2693.0, + "completions/mean_length": 1028.407470703125, + "completions/mean_terminated_length": 585.702392578125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 1.179644211140274, + "grad_norm": 0.27300500869750977, + "learning_rate": 1e-06, + "loss": -0.0383, + "num_tokens": 76574741.0, + "reward": 0.6350446939468384, + "reward_std": 0.16183637082576752, + "rewards/verify_math_reward/mean": 0.6350446343421936, + "rewards/verify_math_reward/std": 0.481686532497406, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0026057456525450107, + "clip_ratio/high_mean": 0.0010075661084556486, + "clip_ratio/low_mean": 0.001023779108436429, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020313452623668127, + "epoch": 1.1819772528433945, + "grad_norm": 0.2584632933139801, + "learning_rate": 1e-06, + "loss": -0.0386, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0027077028789790347, + "clip_ratio/high_mean": 0.0010219967316515977, + "clip_ratio/low_mean": 0.0011785405349655775, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022005372447893023, + "epoch": 1.184310294546515, + "grad_norm": 0.21871411800384521, + "learning_rate": 1e-06, + "loss": -0.0388, + "step": 507 + }, + { + "clip_ratio/high_max": 0.002569734613643959, + "clip_ratio/high_mean": 0.0009420320166100282, + "clip_ratio/low_mean": 0.0014475440202659229, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023895760532468557, + "epoch": 1.1866433362496354, + "grad_norm": 0.21576610207557678, + "learning_rate": 1e-06, + "loss": -0.0389, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0026465314876986668, + "clip_ratio/high_mean": 0.0011771513309213333, + "clip_ratio/low_mean": 0.0005883481826458592, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017654995463090017, + "completions/clipped_ratio": 0.1194196428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2788.0, + "completions/mean_length": 1036.7489013671875, + "completions/mean_terminated_length": 621.8694458007812, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 1.188976377952756, + "grad_norm": 0.2937774658203125, + "learning_rate": 1e-06, + "loss": -0.0722, + "num_tokens": 77157316.0, + "reward": 0.5792410969734192, + "reward_std": 0.19799211621284485, + "rewards/verify_math_reward/mean": 0.5792410969734192, + "rewards/verify_math_reward/std": 0.49395665526390076, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0030839903265587054, + "clip_ratio/high_mean": 0.0013798255131405313, + "clip_ratio/low_mean": 0.0008560552560084034, + "clip_ratio/low_min": 3.3068783523049206e-05, + "clip_ratio/region_mean": 0.0022358807545970194, + "epoch": 1.1913094196558764, + "grad_norm": 0.25443971157073975, + "learning_rate": 1e-06, + "loss": -0.0725, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0034403286554152146, + "clip_ratio/high_mean": 0.0014830293293925934, + "clip_ratio/low_mean": 0.001141271259257337, + "clip_ratio/low_min": 4.0102662751451135e-05, + "clip_ratio/region_mean": 0.0026243005704600364, + "epoch": 1.1936424613589969, + "grad_norm": 0.19449834525585175, + "learning_rate": 1e-06, + "loss": -0.0727, + "step": 511 + }, + { + "clip_ratio/high_max": 0.003032838176295627, + "clip_ratio/high_mean": 0.001370453795971116, + "clip_ratio/low_mean": 0.0011973431919614086, + "clip_ratio/low_min": 7.449344411725178e-05, + "clip_ratio/region_mean": 0.0025677970043034293, + "epoch": 1.1959755030621173, + "grad_norm": 0.20635251700878143, + "learning_rate": 1e-06, + "loss": -0.0728, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0024143223017745186, + "clip_ratio/high_mean": 0.0009924522710207384, + "clip_ratio/low_mean": 0.0007360769614024321, + "clip_ratio/low_min": 1.7293858036282472e-05, + "clip_ratio/region_mean": 0.001728529248794075, + "completions/clipped_ratio": 0.1171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3750.0, + "completions/mean_length": 1060.2254638671875, + "completions/mean_terminated_length": 657.2465209960938, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 1.1983085447652377, + "grad_norm": 0.2654973864555359, + "learning_rate": 1e-06, + "loss": -0.0526, + "num_tokens": 77770286.0, + "reward": 0.5680803656578064, + "reward_std": 0.1995958387851715, + "rewards/verify_math_reward/mean": 0.5680803656578064, + "rewards/verify_math_reward/std": 0.4956200420856476, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0024228927577496506, + "clip_ratio/high_mean": 0.0010833551295945654, + "clip_ratio/low_mean": 0.0010713286137615796, + "clip_ratio/low_min": 9.454540486331098e-05, + "clip_ratio/region_mean": 0.0021546837378991768, + "epoch": 1.2006415864683582, + "grad_norm": 0.22051560878753662, + "learning_rate": 1e-06, + "loss": -0.053, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0030360161181306466, + "clip_ratio/high_mean": 0.0011675423866108758, + "clip_ratio/low_mean": 0.0012180573357909452, + "clip_ratio/low_min": 0.00010423983803775627, + "clip_ratio/region_mean": 0.002385599735134747, + "epoch": 1.2029746281714786, + "grad_norm": 0.20579476654529572, + "learning_rate": 1e-06, + "loss": -0.0531, + "step": 515 + }, + { + "clip_ratio/high_max": 0.002736599933996331, + "clip_ratio/high_mean": 0.0011807907576439902, + "clip_ratio/low_mean": 0.0014078295971557964, + "clip_ratio/low_min": 9.662347838457208e-05, + "clip_ratio/region_mean": 0.0025886203802656382, + "epoch": 1.205307669874599, + "grad_norm": 0.2896462082862854, + "learning_rate": 1e-06, + "loss": -0.0533, + "step": 516 + }, + { + "clip_ratio/high_max": 0.002434489084407687, + "clip_ratio/high_mean": 0.001015359470329713, + "clip_ratio/low_mean": 0.0005827424547533155, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001598101960553322, + "completions/clipped_ratio": 0.1294642857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4044.0, + "completions/mean_length": 1130.09375, + "completions/mean_terminated_length": 689.01025390625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 1.2076407115777195, + "grad_norm": 0.46300458908081055, + "learning_rate": 1e-06, + "loss": -0.0369, + "num_tokens": 78400858.0, + "reward": 0.5658482313156128, + "reward_std": 0.17829221487045288, + "rewards/verify_math_reward/mean": 0.5658482313156128, + "rewards/verify_math_reward/std": 0.49592188000679016, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0025428009321331047, + "clip_ratio/high_mean": 0.0011331242694723187, + "clip_ratio/low_mean": 0.0012312159096836695, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002364340180065483, + "epoch": 1.20997375328084, + "grad_norm": 0.6013938784599304, + "learning_rate": 1e-06, + "loss": -0.0371, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0031117841717787087, + "clip_ratio/high_mean": 0.001198860463773599, + "clip_ratio/low_mean": 0.002348972273466643, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0035478327226883266, + "epoch": 1.2123067949839603, + "grad_norm": 0.3071836829185486, + "learning_rate": 1e-06, + "loss": -0.0375, + "step": 519 + }, + { + "clip_ratio/high_max": 0.002819619156070985, + "clip_ratio/high_mean": 0.0012652650621021166, + "clip_ratio/low_mean": 0.002037731521340902, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003302996585262008, + "epoch": 1.2146398366870808, + "grad_norm": 2.0215115547180176, + "learning_rate": 1e-06, + "loss": -0.0354, + "step": 520 + }, + { + "clip_ratio/high_max": 0.002378325239988044, + "clip_ratio/high_mean": 0.0008747008068894502, + "clip_ratio/low_mean": 0.0004809233500964183, + "clip_ratio/low_min": 1.7477628716733307e-05, + "clip_ratio/region_mean": 0.001355624190182425, + "completions/clipped_ratio": 0.0814732142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3359.0, + "completions/mean_length": 879.5569458007812, + "completions/mean_terminated_length": 594.2588500976562, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 1.2169728783902012, + "grad_norm": 0.29543742537498474, + "learning_rate": 1e-06, + "loss": -0.0413, + "num_tokens": 78975085.0, + "reward": 0.5892857313156128, + "reward_std": 0.15221282839775085, + "rewards/verify_math_reward/mean": 0.5892857313156128, + "rewards/verify_math_reward/std": 0.49223825335502625, + "step": 521 + }, + { + "clip_ratio/high_max": 0.002576551472884603, + "clip_ratio/high_mean": 0.0009847943820204819, + "clip_ratio/low_mean": 0.0007131154156923003, + "clip_ratio/low_min": 2.307124486833345e-05, + "clip_ratio/region_mean": 0.0016979097708826885, + "epoch": 1.2193059200933216, + "grad_norm": 0.22369630634784698, + "learning_rate": 1e-06, + "loss": -0.0416, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0028981081777601503, + "clip_ratio/high_mean": 0.0010226434642390814, + "clip_ratio/low_mean": 0.0008572595120313053, + "clip_ratio/low_min": 3.495525743346661e-05, + "clip_ratio/region_mean": 0.0018799030040099751, + "epoch": 1.221638961796442, + "grad_norm": 0.19105824828147888, + "learning_rate": 1e-06, + "loss": -0.0418, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0028165603798697703, + "clip_ratio/high_mean": 0.0010105444034707034, + "clip_ratio/low_mean": 0.0009608435766494949, + "clip_ratio/low_min": 2.4029219275689684e-05, + "clip_ratio/region_mean": 0.0019713879737537354, + "epoch": 1.2239720034995625, + "grad_norm": 0.2389257699251175, + "learning_rate": 1e-06, + "loss": -0.0418, + "step": 524 + }, + { + "clip_ratio/high_max": 0.002592427917988971, + "clip_ratio/high_mean": 0.0009008916185848648, + "clip_ratio/low_mean": 0.00042396093704155646, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013248525792732835, + "completions/clipped_ratio": 0.1082589285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3717.0, + "completions/mean_length": 969.5078735351562, + "completions/mean_terminated_length": 589.9461669921875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 1.226305045202683, + "grad_norm": 0.22978700697422028, + "learning_rate": 1e-06, + "loss": -0.0676, + "num_tokens": 79531948.0, + "reward": 0.6071428656578064, + "reward_std": 0.151575967669487, + "rewards/verify_math_reward/mean": 0.6071428656578064, + "rewards/verify_math_reward/std": 0.48865827918052673, + "step": 525 + }, + { + "clip_ratio/high_max": 0.003308030150947161, + "clip_ratio/high_mean": 0.0011520420393935638, + "clip_ratio/low_mean": 0.0006933599233889254, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018454019736964256, + "epoch": 1.2286380869058036, + "grad_norm": 0.1991288810968399, + "learning_rate": 1e-06, + "loss": -0.0679, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0037966473028063774, + "clip_ratio/high_mean": 0.0012030818907078356, + "clip_ratio/low_mean": 0.0008858901655912632, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002088972025376279, + "epoch": 1.2309711286089238, + "grad_norm": 0.1985888034105301, + "learning_rate": 1e-06, + "loss": -0.068, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0035153744684066623, + "clip_ratio/high_mean": 0.0011972285283263773, + "clip_ratio/low_mean": 0.0010277893488819245, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002225017909950111, + "epoch": 1.2333041703120444, + "grad_norm": 0.18501132726669312, + "learning_rate": 1e-06, + "loss": -0.0681, + "step": 528 + }, + { + "clip_ratio/high_max": 0.002251866906590294, + "clip_ratio/high_mean": 0.0007997916636668378, + "clip_ratio/low_mean": 0.0006696858235955006, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014694774727104232, + "completions/clipped_ratio": 0.1450892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2754.0, + "completions/mean_length": 1112.25341796875, + "completions/mean_terminated_length": 605.8733520507812, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 1.2356372120151649, + "grad_norm": 0.29438626766204834, + "learning_rate": 1e-06, + "loss": -0.0569, + "num_tokens": 80086919.0, + "reward": 0.5390625, + "reward_std": 0.16435301303863525, + "rewards/verify_math_reward/mean": 0.5390625, + "rewards/verify_math_reward/std": 0.4987502098083496, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0027898812768398784, + "clip_ratio/high_mean": 0.0010612102978484472, + "clip_ratio/low_mean": 0.0008457083604298532, + "clip_ratio/low_min": 1.3377568393480033e-05, + "clip_ratio/region_mean": 0.0019069186528213322, + "epoch": 1.2379702537182853, + "grad_norm": 0.22571726143360138, + "learning_rate": 1e-06, + "loss": -0.0572, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0031235070564434864, + "clip_ratio/high_mean": 0.0010874585277633741, + "clip_ratio/low_mean": 0.0010993516680173343, + "clip_ratio/low_min": 1.3377568393480033e-05, + "clip_ratio/region_mean": 0.002186810153943952, + "epoch": 1.2403032954214057, + "grad_norm": 0.18684718012809753, + "learning_rate": 1e-06, + "loss": -0.0574, + "step": 531 + }, + { + "clip_ratio/high_max": 0.002868225252314005, + "clip_ratio/high_mean": 0.0010452830683789216, + "clip_ratio/low_mean": 0.0012705202316283248, + "clip_ratio/low_min": 3.6909448681399226e-05, + "clip_ratio/region_mean": 0.0023158032781793736, + "epoch": 1.2426363371245261, + "grad_norm": 0.20576512813568115, + "learning_rate": 1e-06, + "loss": -0.0574, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0024350592066184618, + "clip_ratio/high_mean": 0.0010799655137816444, + "clip_ratio/low_mean": 0.0005255854721326614, + "clip_ratio/low_min": 1.6657782907714136e-05, + "clip_ratio/region_mean": 0.001605550991371274, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3299.0, + "completions/mean_length": 976.3772583007812, + "completions/mean_terminated_length": 593.2656860351562, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 1.2449693788276466, + "grad_norm": 0.3275110423564911, + "learning_rate": 1e-06, + "loss": -0.0671, + "num_tokens": 80641609.0, + "reward": 0.6428571939468384, + "reward_std": 0.18265356123447418, + "rewards/verify_math_reward/mean": 0.6428571343421936, + "rewards/verify_math_reward/std": 0.4794250428676605, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0031015579297672957, + "clip_ratio/high_mean": 0.0013122145355737302, + "clip_ratio/low_mean": 0.000775938459810277, + "clip_ratio/low_min": 2.710908665903844e-05, + "clip_ratio/region_mean": 0.002088153036311269, + "epoch": 1.247302420530767, + "grad_norm": 0.281019926071167, + "learning_rate": 1e-06, + "loss": -0.0673, + "step": 534 + }, + { + "clip_ratio/high_max": 0.002716930066526402, + "clip_ratio/high_mean": 0.0012175443043815903, + "clip_ratio/low_mean": 0.0009726612188387662, + "clip_ratio/low_min": 1.6657782907714136e-05, + "clip_ratio/region_mean": 0.0021902055596001446, + "epoch": 1.2496354622338874, + "grad_norm": 0.20063097774982452, + "learning_rate": 1e-06, + "loss": -0.0675, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0030896394382580183, + "clip_ratio/high_mean": 0.0012307345277804416, + "clip_ratio/low_mean": 0.0011316145337332273, + "clip_ratio/low_min": 1.355454332951922e-05, + "clip_ratio/region_mean": 0.0023623489978490397, + "epoch": 1.2519685039370079, + "grad_norm": 0.2279004156589508, + "learning_rate": 1e-06, + "loss": -0.0675, + "step": 536 + }, + { + "clip_ratio/high_max": 0.002269060983962845, + "clip_ratio/high_mean": 0.0009541396066197194, + "clip_ratio/low_mean": 0.0005999795075695147, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015541191351076122, + "completions/clipped_ratio": 0.0970982142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3890.0, + "completions/mean_length": 934.7801513671875, + "completions/mean_terminated_length": 594.822021484375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 1.2543015456401283, + "grad_norm": 0.2848651111125946, + "learning_rate": 1e-06, + "loss": -0.0739, + "num_tokens": 81206836.0, + "reward": 0.6696428656578064, + "reward_std": 0.17649206519126892, + "rewards/verify_math_reward/mean": 0.6696428656578064, + "rewards/verify_math_reward/std": 0.47060438990592957, + "step": 537 + }, + { + "clip_ratio/high_max": 0.00296455933857942, + "clip_ratio/high_mean": 0.0012060076842317358, + "clip_ratio/low_mean": 0.0008301700763695408, + "clip_ratio/low_min": 1.385194991598837e-05, + "clip_ratio/region_mean": 0.0020361777787911706, + "epoch": 1.2566345873432487, + "grad_norm": 0.3101758360862732, + "learning_rate": 1e-06, + "loss": -0.0741, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0026383453950984403, + "clip_ratio/high_mean": 0.0011584382482396904, + "clip_ratio/low_mean": 0.0010574414545772015, + "clip_ratio/low_min": 3.907471182174049e-05, + "clip_ratio/region_mean": 0.0022158796709845774, + "epoch": 1.2589676290463692, + "grad_norm": 0.21931804716587067, + "learning_rate": 1e-06, + "loss": -0.0743, + "step": 539 + }, + { + "clip_ratio/high_max": 0.002541691101214383, + "clip_ratio/high_mean": 0.001116355419071624, + "clip_ratio/low_mean": 0.0012633948963411967, + "clip_ratio/low_min": 2.770389983197674e-05, + "clip_ratio/region_mean": 0.0023797503308742307, + "epoch": 1.2613006707494896, + "grad_norm": 0.3209487199783325, + "learning_rate": 1e-06, + "loss": -0.0744, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0023388528879877413, + "clip_ratio/high_mean": 0.0008931090860642144, + "clip_ratio/low_mean": 0.0006677318378933705, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001560840908496175, + "completions/clipped_ratio": 0.1127232142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3303.0, + "completions/mean_length": 992.5625610351562, + "completions/mean_terminated_length": 598.289306640625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 1.26363371245261, + "grad_norm": 0.24479156732559204, + "learning_rate": 1e-06, + "loss": -0.0438, + "num_tokens": 81774076.0, + "reward": 0.5770089626312256, + "reward_std": 0.1669466197490692, + "rewards/verify_math_reward/mean": 0.5770089030265808, + "rewards/verify_math_reward/std": 0.4943099617958069, + "step": 541 + }, + { + "clip_ratio/high_max": 0.002695808510907227, + "clip_ratio/high_mean": 0.0011374612786312355, + "clip_ratio/low_mean": 0.0007252048162627034, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018626660930749495, + "epoch": 1.2659667541557305, + "grad_norm": 0.22449366748332977, + "learning_rate": 1e-06, + "loss": -0.0439, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0026009647990576923, + "clip_ratio/high_mean": 0.0010876012001972413, + "clip_ratio/low_mean": 0.0010571295370027656, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021447307444759645, + "epoch": 1.268299795858851, + "grad_norm": 0.1874256134033203, + "learning_rate": 1e-06, + "loss": -0.0441, + "step": 543 + }, + { + "clip_ratio/high_max": 0.002826059189828811, + "clip_ratio/high_mean": 0.0011576665710890666, + "clip_ratio/low_mean": 0.0011849075144709786, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002342574080103077, + "epoch": 1.2706328375619713, + "grad_norm": 0.17332985997200012, + "learning_rate": 1e-06, + "loss": -0.0442, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0022264186009124387, + "clip_ratio/high_mean": 0.001017978900563321, + "clip_ratio/low_mean": 0.000610701295954641, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016286801983369514, + "completions/clipped_ratio": 0.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2789.0, + "completions/mean_length": 1057.01904296875, + "completions/mean_terminated_length": 622.8787841796875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 1.272965879265092, + "grad_norm": 0.3083045184612274, + "learning_rate": 1e-06, + "loss": -0.05, + "num_tokens": 82348637.0, + "reward": 0.5770089626312256, + "reward_std": 0.18761083483695984, + "rewards/verify_math_reward/mean": 0.5770089030265808, + "rewards/verify_math_reward/std": 0.4943099319934845, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0033035823944373988, + "clip_ratio/high_mean": 0.0012792956767953, + "clip_ratio/low_mean": 0.0010433476818434428, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023226433695526794, + "epoch": 1.2752989209682122, + "grad_norm": 0.2574704885482788, + "learning_rate": 1e-06, + "loss": -0.0503, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0032511619647266343, + "clip_ratio/high_mean": 0.0012776143485098146, + "clip_ratio/low_mean": 0.0011831686570076272, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002460783020069357, + "epoch": 1.2776319626713328, + "grad_norm": 0.23422737419605255, + "learning_rate": 1e-06, + "loss": -0.0504, + "step": 547 + }, + { + "clip_ratio/high_max": 0.002982414713187609, + "clip_ratio/high_mean": 0.0011638829091680236, + "clip_ratio/low_mean": 0.001405524933943525, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002569407799455803, + "epoch": 1.2799650043744533, + "grad_norm": 0.2075558751821518, + "learning_rate": 1e-06, + "loss": -0.0506, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0018218839504697826, + "clip_ratio/high_mean": 0.0006480422534878016, + "clip_ratio/low_mean": 0.0006002505042488337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012482927522796672, + "completions/clipped_ratio": 0.1450892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3967.0, + "completions/mean_length": 1145.2366943359375, + "completions/mean_terminated_length": 644.454345703125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 1.2822980460775737, + "grad_norm": 0.23727191984653473, + "learning_rate": 1e-06, + "loss": -0.0327, + "num_tokens": 82934601.0, + "reward": 0.5502232313156128, + "reward_std": 0.13722196221351624, + "rewards/verify_math_reward/mean": 0.5502232313156128, + "rewards/verify_math_reward/std": 0.49774909019470215, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0020135488175583305, + "clip_ratio/high_mean": 0.000720536571861885, + "clip_ratio/low_mean": 0.000795816269601346, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015163528587436303, + "epoch": 1.2846310877806941, + "grad_norm": 0.20960785448551178, + "learning_rate": 1e-06, + "loss": -0.0328, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0021363516370911384, + "clip_ratio/high_mean": 0.0007712140754847496, + "clip_ratio/low_mean": 0.001017558195599122, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017887722860905342, + "epoch": 1.2869641294838146, + "grad_norm": 0.1890256106853485, + "learning_rate": 1e-06, + "loss": -0.0329, + "step": 551 + }, + { + "clip_ratio/high_max": 0.002385011419391958, + "clip_ratio/high_mean": 0.0007658743115825928, + "clip_ratio/low_mean": 0.0012119299499318004, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019778042478719726, + "epoch": 1.289297171186935, + "grad_norm": 0.16167958080768585, + "learning_rate": 1e-06, + "loss": -0.033, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0025781944350455888, + "clip_ratio/high_mean": 0.001074661959137302, + "clip_ratio/low_mean": 0.0006483473480329849, + "clip_ratio/low_min": 1.0751763511507306e-05, + "clip_ratio/region_mean": 0.0017230093289981596, + "completions/clipped_ratio": 0.1618303571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3884.0, + "completions/mean_length": 1220.677490234375, + "completions/mean_terminated_length": 665.52197265625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 1.2916302128900554, + "grad_norm": 0.2890176475048065, + "learning_rate": 1e-06, + "loss": -0.0684, + "num_tokens": 83530192.0, + "reward": 0.515625, + "reward_std": 0.19622044265270233, + "rewards/verify_math_reward/mean": 0.515625, + "rewards/verify_math_reward/std": 0.5000349283218384, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0029420962018775754, + "clip_ratio/high_mean": 0.0012198284202895593, + "clip_ratio/low_mean": 0.000763745676522376, + "clip_ratio/low_min": 2.1606361769954674e-05, + "clip_ratio/region_mean": 0.0019835740604321472, + "epoch": 1.2939632545931758, + "grad_norm": 0.24971313774585724, + "learning_rate": 1e-06, + "loss": -0.0685, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0034426571655785665, + "clip_ratio/high_mean": 0.001382934980938444, + "clip_ratio/low_mean": 0.001083816277969163, + "clip_ratio/low_min": 1.0751763511507306e-05, + "clip_ratio/region_mean": 0.0024667512843734585, + "epoch": 1.2962962962962963, + "grad_norm": 0.21363192796707153, + "learning_rate": 1e-06, + "loss": -0.0688, + "step": 555 + }, + { + "clip_ratio/high_max": 0.003283892227045726, + "clip_ratio/high_mean": 0.0013199301065469626, + "clip_ratio/low_mean": 0.0012499797667260282, + "clip_ratio/low_min": 2.1244051822577603e-05, + "clip_ratio/region_mean": 0.0025699098987388425, + "epoch": 1.2986293379994167, + "grad_norm": 0.22296246886253357, + "learning_rate": 1e-06, + "loss": -0.0689, + "step": 556 + }, + { + "clip_ratio/high_max": 0.001826815896492917, + "clip_ratio/high_mean": 0.0006240852071641712, + "clip_ratio/low_mean": 0.0005783656320090813, + "clip_ratio/low_min": 9.656983820605092e-06, + "clip_ratio/region_mean": 0.0012024508323520422, + "completions/clipped_ratio": 0.1517857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3930.0, + "completions/mean_length": 1182.9554443359375, + "completions/mean_terminated_length": 661.6737060546875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 1.3009623797025371, + "grad_norm": 0.2738368511199951, + "learning_rate": 1e-06, + "loss": -0.054, + "num_tokens": 84126120.0, + "reward": 0.5089285969734192, + "reward_std": 0.1451537311077118, + "rewards/verify_math_reward/mean": 0.5089285969734192, + "rewards/verify_math_reward/std": 0.5001994967460632, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0022162531822687015, + "clip_ratio/high_mean": 0.0007980015361681581, + "clip_ratio/low_mean": 0.0007888012933108257, + "clip_ratio/low_min": 1.0867674973269459e-05, + "clip_ratio/region_mean": 0.0015868028131080791, + "epoch": 1.3032954214056576, + "grad_norm": 0.2366650104522705, + "learning_rate": 1e-06, + "loss": -0.0543, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0025047057497431524, + "clip_ratio/high_mean": 0.0008618533920525806, + "clip_ratio/low_mean": 0.001016271615299047, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001878125018265564, + "epoch": 1.305628463108778, + "grad_norm": 0.21774974465370178, + "learning_rate": 1e-06, + "loss": -0.0544, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0023650196781090926, + "clip_ratio/high_mean": 0.0008386210593016585, + "clip_ratio/low_mean": 0.0012069545991835184, + "clip_ratio/low_min": 6.15359385847114e-05, + "clip_ratio/region_mean": 0.0020455756784940604, + "epoch": 1.3079615048118984, + "grad_norm": 0.18776483833789825, + "learning_rate": 1e-06, + "loss": -0.0545, + "step": 560 + }, + { + "clip_ratio/high_max": 0.002210908722190652, + "clip_ratio/high_mean": 0.000765536697144853, + "clip_ratio/low_mean": 0.000620098414401582, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013856350979040144, + "completions/clipped_ratio": 0.1696428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4046.0, + "completions/mean_length": 1280.943115234375, + "completions/mean_terminated_length": 705.8239135742188, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 1.3102945465150189, + "grad_norm": 0.24883843958377838, + "learning_rate": 1e-06, + "loss": -0.0668, + "num_tokens": 84745181.0, + "reward": 0.5167410969734192, + "reward_std": 0.15518662333488464, + "rewards/verify_math_reward/mean": 0.5167410969734192, + "rewards/verify_math_reward/std": 0.4999987483024597, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0027370128518668935, + "clip_ratio/high_mean": 0.0010297183680449962, + "clip_ratio/low_mean": 0.0008404065083595924, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018701249246078078, + "epoch": 1.3126275882181395, + "grad_norm": 0.19404339790344238, + "learning_rate": 1e-06, + "loss": -0.067, + "step": 562 + }, + { + "clip_ratio/high_max": 0.002672463200724451, + "clip_ratio/high_mean": 0.0009291456617575022, + "clip_ratio/low_mean": 0.0010373450859333389, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001966490781342145, + "epoch": 1.3149606299212597, + "grad_norm": 0.2075706124305725, + "learning_rate": 1e-06, + "loss": -0.0672, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0024174200952984393, + "clip_ratio/high_mean": 0.0009365485079797509, + "clip_ratio/low_mean": 0.0011528040340635926, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002089352550683543, + "epoch": 1.3172936716243804, + "grad_norm": 0.173910990357399, + "learning_rate": 1e-06, + "loss": -0.0672, + "step": 564 + }, + { + "clip_ratio/high_max": 0.002502191076928284, + "clip_ratio/high_mean": 0.0008761862118262798, + "clip_ratio/low_mean": 0.0008554907271900447, + "clip_ratio/low_min": 1.070755570253823e-05, + "clip_ratio/region_mean": 0.0017316769881290384, + "completions/clipped_ratio": 0.1774553571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4043.0, + "completions/mean_length": 1279.318115234375, + "completions/mean_terminated_length": 671.6485595703125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 1.3196267133275008, + "grad_norm": 0.2694673240184784, + "learning_rate": 1e-06, + "loss": -0.0587, + "num_tokens": 85333594.0, + "reward": 0.4955357313156128, + "reward_std": 0.16450457274913788, + "rewards/verify_math_reward/mean": 0.4955357015132904, + "rewards/verify_math_reward/std": 0.500259280204773, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0036288581541157328, + "clip_ratio/high_mean": 0.0013075614733679686, + "clip_ratio/low_mean": 0.001007825490887626, + "clip_ratio/low_min": 2.141511140507646e-05, + "clip_ratio/region_mean": 0.0023153869624366052, + "epoch": 1.3219597550306212, + "grad_norm": 0.27903032302856445, + "learning_rate": 1e-06, + "loss": -0.0589, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0031396789054269902, + "clip_ratio/high_mean": 0.0010936912440229207, + "clip_ratio/low_mean": 0.0011650597607513191, + "clip_ratio/low_min": 1.2437811165000312e-05, + "clip_ratio/region_mean": 0.002258751028421102, + "epoch": 1.3242927967337417, + "grad_norm": 0.22430910170078278, + "learning_rate": 1e-06, + "loss": -0.0592, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0029859956121072173, + "clip_ratio/high_mean": 0.0010755592793429969, + "clip_ratio/low_mean": 0.0014766222884645686, + "clip_ratio/low_min": 3.731343167601153e-05, + "clip_ratio/region_mean": 0.0025521815186948515, + "epoch": 1.326625838436862, + "grad_norm": 0.20644904673099518, + "learning_rate": 1e-06, + "loss": -0.0593, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0024077963025774807, + "clip_ratio/high_mean": 0.0009524892739136703, + "clip_ratio/low_mean": 0.0007456240709871054, + "clip_ratio/low_min": 2.887827122322051e-05, + "clip_ratio/region_mean": 0.0016981133230729029, + "completions/clipped_ratio": 0.1741071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4002.0, + "completions/mean_length": 1243.03466796875, + "completions/mean_terminated_length": 641.5986938476562, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 1.3289588801399825, + "grad_norm": 0.3015615940093994, + "learning_rate": 1e-06, + "loss": -0.0808, + "num_tokens": 85899945.0, + "reward": 0.5011160969734192, + "reward_std": 0.19895894825458527, + "rewards/verify_math_reward/mean": 0.5011160969734192, + "rewards/verify_math_reward/std": 0.5002779960632324, + "step": 569 + }, + { + "clip_ratio/high_max": 0.003500665996398311, + "clip_ratio/high_mean": 0.0012464672872738447, + "clip_ratio/low_mean": 0.0010257575977448141, + "clip_ratio/low_min": 0.00012356816660030745, + "clip_ratio/region_mean": 0.002272224875923712, + "epoch": 1.331291921843103, + "grad_norm": 0.24528682231903076, + "learning_rate": 1e-06, + "loss": -0.0811, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0038307572322082706, + "clip_ratio/high_mean": 0.0013789985459879972, + "clip_ratio/low_mean": 0.0012182924365333747, + "clip_ratio/low_min": 0.00011780763452406973, + "clip_ratio/region_mean": 0.002597291022539139, + "epoch": 1.3336249635462234, + "grad_norm": 0.208794504404068, + "learning_rate": 1e-06, + "loss": -0.0814, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0031483758939430118, + "clip_ratio/high_mean": 0.0011578826415643562, + "clip_ratio/low_mean": 0.0014242229008232243, + "clip_ratio/low_min": 0.00011548066322575323, + "clip_ratio/region_mean": 0.002582105516921729, + "epoch": 1.3359580052493438, + "grad_norm": 0.2046995311975479, + "learning_rate": 1e-06, + "loss": -0.0814, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0023555047664558515, + "clip_ratio/high_mean": 0.0008531126677553402, + "clip_ratio/low_mean": 0.0005902969119233603, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014434096083277836, + "completions/clipped_ratio": 0.1752232142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3835.0, + "completions/mean_length": 1285.040283203125, + "completions/mean_terminated_length": 687.8538208007812, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 1.3382910469524643, + "grad_norm": 0.25017184019088745, + "learning_rate": 1e-06, + "loss": -0.0753, + "num_tokens": 86504485.0, + "reward": 0.5256696939468384, + "reward_std": 0.17757472395896912, + "rewards/verify_math_reward/mean": 0.5256696343421936, + "rewards/verify_math_reward/std": 0.4996195137500763, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0028082246353733353, + "clip_ratio/high_mean": 0.0010514356654311996, + "clip_ratio/low_mean": 0.0007836565755496849, + "clip_ratio/low_min": 4.149826600041706e-05, + "clip_ratio/region_mean": 0.0018350922473473474, + "epoch": 1.3406240886555847, + "grad_norm": 0.22260355949401855, + "learning_rate": 1e-06, + "loss": -0.0755, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0034009277733275667, + "clip_ratio/high_mean": 0.0011136180037283339, + "clip_ratio/low_mean": 0.001009279998470447, + "clip_ratio/low_min": 5.6302465964108706e-05, + "clip_ratio/region_mean": 0.0021228979603620246, + "epoch": 1.3429571303587051, + "grad_norm": 0.2014068365097046, + "learning_rate": 1e-06, + "loss": -0.0758, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0031681032451160718, + "clip_ratio/high_mean": 0.0011202151526958914, + "clip_ratio/low_mean": 0.0011742266851797467, + "clip_ratio/low_min": 6.622609544137958e-05, + "clip_ratio/region_mean": 0.0022944418524275534, + "epoch": 1.3452901720618256, + "grad_norm": 0.2062731385231018, + "learning_rate": 1e-06, + "loss": -0.0758, + "step": 576 + }, + { + "clip_ratio/high_max": 0.00190445312182419, + "clip_ratio/high_mean": 0.0007755427886877442, + "clip_ratio/low_mean": 0.0005309752777975518, + "clip_ratio/low_min": 1.972231075342279e-05, + "clip_ratio/region_mean": 0.0013065180792182218, + "completions/clipped_ratio": 0.1852678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3200.0, + "completions/mean_length": 1232.1640625, + "completions/mean_terminated_length": 580.9356079101562, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 1.347623213764946, + "grad_norm": 0.30050161480903625, + "learning_rate": 1e-06, + "loss": -0.0672, + "num_tokens": 87015224.0, + "reward": 0.559151828289032, + "reward_std": 0.15045617520809174, + "rewards/verify_math_reward/mean": 0.5591517686843872, + "rewards/verify_math_reward/std": 0.496766060590744, + "step": 577 + }, + { + "clip_ratio/high_max": 0.002414862574369181, + "clip_ratio/high_mean": 0.0009709475780255161, + "clip_ratio/low_mean": 0.0007054359193716664, + "clip_ratio/low_min": 1.972231075342279e-05, + "clip_ratio/region_mean": 0.001676383486483246, + "epoch": 1.3499562554680664, + "grad_norm": 0.23338362574577332, + "learning_rate": 1e-06, + "loss": -0.0674, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0023903432229417376, + "clip_ratio/high_mean": 0.0009181600580632221, + "clip_ratio/low_mean": 0.0009727856850076932, + "clip_ratio/low_min": 3.944462150684558e-05, + "clip_ratio/region_mean": 0.0018909457576228306, + "epoch": 1.352289297171187, + "grad_norm": 0.21429312229156494, + "learning_rate": 1e-06, + "loss": -0.0675, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0023471144886570983, + "clip_ratio/high_mean": 0.0009947415128408466, + "clip_ratio/low_mean": 0.0010963070253637852, + "clip_ratio/low_min": 3.944462150684558e-05, + "clip_ratio/region_mean": 0.0020910485764034092, + "epoch": 1.3546223388743073, + "grad_norm": 0.19566044211387634, + "learning_rate": 1e-06, + "loss": -0.0676, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0024408697499893606, + "clip_ratio/high_mean": 0.000832719366371748, + "clip_ratio/low_mean": 0.0005201030976422771, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013528224553738255, + "completions/clipped_ratio": 0.2087053571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4034.0, + "completions/mean_length": 1369.0457763671875, + "completions/mean_terminated_length": 649.8067626953125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 1.356955380577428, + "grad_norm": 0.2526213526725769, + "learning_rate": 1e-06, + "loss": -0.0483, + "num_tokens": 87579321.0, + "reward": 0.535714328289032, + "reward_std": 0.1345217078924179, + "rewards/verify_math_reward/mean": 0.5357142686843872, + "rewards/verify_math_reward/std": 0.4990014135837555, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0030322665261337534, + "clip_ratio/high_mean": 0.0010206833885604283, + "clip_ratio/low_mean": 0.0007519806695199804, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017726640435284935, + "epoch": 1.3592884222805481, + "grad_norm": 0.25519487261772156, + "learning_rate": 1e-06, + "loss": -0.0485, + "step": 582 + }, + { + "clip_ratio/high_max": 0.003515000549668912, + "clip_ratio/high_mean": 0.0011885368476214353, + "clip_ratio/low_mean": 0.0009024976052387501, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002091034493787447, + "epoch": 1.3616214639836688, + "grad_norm": 0.4738200902938843, + "learning_rate": 1e-06, + "loss": -0.0487, + "step": 583 + }, + { + "clip_ratio/high_max": 0.003024574962182669, + "clip_ratio/high_mean": 0.0009989466548177006, + "clip_ratio/low_mean": 0.0010811063275468769, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002080053040117491, + "epoch": 1.3639545056867892, + "grad_norm": 0.2041773796081543, + "learning_rate": 1e-06, + "loss": -0.0488, + "step": 584 + }, + { + "clip_ratio/high_max": 0.002207355821155943, + "clip_ratio/high_mean": 0.0008487261347909225, + "clip_ratio/low_mean": 0.00045651231687315885, + "clip_ratio/low_min": 1.7443482647649944e-05, + "clip_ratio/region_mean": 0.0013052384383627214, + "completions/clipped_ratio": 0.2042410714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4059.0, + "completions/mean_length": 1383.12841796875, + "completions/mean_terminated_length": 686.8372802734375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 1.3662875473899097, + "grad_norm": 0.28928208351135254, + "learning_rate": 1e-06, + "loss": -0.0712, + "num_tokens": 88156644.0, + "reward": 0.5055803656578064, + "reward_std": 0.16517673432826996, + "rewards/verify_math_reward/mean": 0.5055803656578064, + "rewards/verify_math_reward/std": 0.5002480745315552, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0027591592152020894, + "clip_ratio/high_mean": 0.0010814143024617806, + "clip_ratio/low_mean": 0.0007400381391562405, + "clip_ratio/low_min": 1.3213531019573566e-05, + "clip_ratio/region_mean": 0.0018214524934592191, + "epoch": 1.36862058909303, + "grad_norm": 0.23145802319049835, + "learning_rate": 1e-06, + "loss": -0.0714, + "step": 586 + }, + { + "clip_ratio/high_max": 0.002570676511822967, + "clip_ratio/high_mean": 0.0010192344198003411, + "clip_ratio/low_mean": 0.0008793568467808655, + "clip_ratio/low_min": 2.402613972662948e-05, + "clip_ratio/region_mean": 0.0018985912993230158, + "epoch": 1.3709536307961505, + "grad_norm": 0.20198185741901398, + "learning_rate": 1e-06, + "loss": -0.0716, + "step": 587 + }, + { + "clip_ratio/high_max": 0.002756901980319526, + "clip_ratio/high_mean": 0.0010476693350938149, + "clip_ratio/low_mean": 0.0010282289204042172, + "clip_ratio/low_min": 5.233044794294983e-05, + "clip_ratio/region_mean": 0.0020758982536790427, + "epoch": 1.373286672499271, + "grad_norm": 0.19303227961063385, + "learning_rate": 1e-06, + "loss": -0.0717, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0030537584316334687, + "clip_ratio/high_mean": 0.0011682229196594562, + "clip_ratio/low_mean": 0.0005520252943824744, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001720248234050814, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3188.0, + "completions/mean_length": 1431.64404296875, + "completions/mean_terminated_length": 685.624267578125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 1.3756197142023914, + "grad_norm": 0.299399733543396, + "learning_rate": 1e-06, + "loss": -0.1017, + "num_tokens": 88720029.0, + "reward": 0.53125, + "reward_std": 0.19854412972927094, + "rewards/verify_math_reward/mean": 0.53125, + "rewards/verify_math_reward/std": 0.4993011951446533, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0032559526662225835, + "clip_ratio/high_mean": 0.0012779856951965485, + "clip_ratio/low_mean": 0.0008215541784011293, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020995398444938473, + "epoch": 1.3779527559055118, + "grad_norm": 0.254035621881485, + "learning_rate": 1e-06, + "loss": -0.102, + "step": 590 + }, + { + "clip_ratio/high_max": 0.003467716960585676, + "clip_ratio/high_mean": 0.0013532105476770084, + "clip_ratio/low_mean": 0.001074977066309657, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002428187603072729, + "epoch": 1.3802857976086322, + "grad_norm": 0.22269730269908905, + "learning_rate": 1e-06, + "loss": -0.1022, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0038947790453676134, + "clip_ratio/high_mean": 0.0014983424262027256, + "clip_ratio/low_mean": 0.0012840630570281064, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027824054850498214, + "epoch": 1.3826188393117527, + "grad_norm": 0.22764192521572113, + "learning_rate": 1e-06, + "loss": -0.1023, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0021997439180267975, + "clip_ratio/high_mean": 0.0010206915576418396, + "clip_ratio/low_mean": 0.00040406626385447453, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001424757832864998, + "completions/clipped_ratio": 0.1573660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3147.0, + "completions/mean_length": 1177.2935791015625, + "completions/mean_terminated_length": 632.2106323242188, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 1.384951881014873, + "grad_norm": 0.3221670985221863, + "learning_rate": 1e-06, + "loss": -0.0642, + "num_tokens": 89281972.0, + "reward": 0.6116071939468384, + "reward_std": 0.163072407245636, + "rewards/verify_math_reward/mean": 0.6116071343421936, + "rewards/verify_math_reward/std": 0.48765692114830017, + "step": 593 + }, + { + "clip_ratio/high_max": 0.003074386069783941, + "clip_ratio/high_mean": 0.0013346814266697038, + "clip_ratio/low_mean": 0.0006285099052547594, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001963191360118799, + "epoch": 1.3872849227179935, + "grad_norm": 0.26880836486816406, + "learning_rate": 1e-06, + "loss": -0.0646, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0026226171030430123, + "clip_ratio/high_mean": 0.0012232904336997308, + "clip_ratio/low_mean": 0.0008326363977175788, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002055926750472281, + "epoch": 1.389617964421114, + "grad_norm": 0.2298322468996048, + "learning_rate": 1e-06, + "loss": -0.0647, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0031332553407992236, + "clip_ratio/high_mean": 0.0012834487861255184, + "clip_ratio/low_mean": 0.0010289407300660969, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023123895152821206, + "epoch": 1.3919510061242344, + "grad_norm": 0.17754845321178436, + "learning_rate": 1e-06, + "loss": -0.0648, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0015846236638026312, + "clip_ratio/high_mean": 0.0005263043115064647, + "clip_ratio/low_mean": 0.0002776564401756332, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008039607509999769, + "completions/clipped_ratio": 0.1573660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3086.0, + "completions/mean_length": 1184.8817138671875, + "completions/mean_terminated_length": 641.2158813476562, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.3942840478273548, + "grad_norm": 0.17424309253692627, + "learning_rate": 1e-06, + "loss": -0.055, + "num_tokens": 89862346.0, + "reward": 0.5926339626312256, + "reward_std": 0.10697808116674423, + "rewards/verify_math_reward/mean": 0.5926339030265808, + "rewards/verify_math_reward/std": 0.49161845445632935, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0016562970013183076, + "clip_ratio/high_mean": 0.0005918571487200097, + "clip_ratio/low_mean": 0.00033543918425493757, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000927296341615147, + "epoch": 1.3966170895304755, + "grad_norm": 0.15606385469436646, + "learning_rate": 1e-06, + "loss": -0.0551, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0018177629535784945, + "clip_ratio/high_mean": 0.0006014393620716874, + "clip_ratio/low_mean": 0.000457512957950712, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010589522898953874, + "epoch": 1.3989501312335957, + "grad_norm": 0.15585005283355713, + "learning_rate": 1e-06, + "loss": -0.0551, + "step": 599 + }, + { + "clip_ratio/high_max": 0.001933376139277243, + "clip_ratio/high_mean": 0.0006321645703337708, + "clip_ratio/low_mean": 0.0005467807463901408, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011789453419623896, + "epoch": 1.4012831729367163, + "grad_norm": 0.14662864804267883, + "learning_rate": 1e-06, + "loss": -0.0552, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0021389451321738306, + "clip_ratio/high_mean": 0.0008527110221621115, + "clip_ratio/low_mean": 0.0005508000540430658, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001403511079843156, + "completions/clipped_ratio": 0.1852678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3851.0, + "completions/mean_length": 1268.9832763671875, + "completions/mean_terminated_length": 626.12744140625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 1.4036162146398368, + "grad_norm": 0.3296069800853729, + "learning_rate": 1e-06, + "loss": -0.0579, + "num_tokens": 90416619.0, + "reward": 0.5424107313156128, + "reward_std": 0.15852880477905273, + "rewards/verify_math_reward/mean": 0.5424107313156128, + "rewards/verify_math_reward/std": 0.4984763264656067, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0026905527847702615, + "clip_ratio/high_mean": 0.001033009248203598, + "clip_ratio/low_mean": 0.0008065595188782027, + "clip_ratio/low_min": 1.9512955987011082e-05, + "clip_ratio/region_mean": 0.0018395687729935162, + "epoch": 1.4059492563429572, + "grad_norm": 0.30061817169189453, + "learning_rate": 1e-06, + "loss": -0.0581, + "step": 602 + }, + { + "clip_ratio/high_max": 0.002843020934960805, + "clip_ratio/high_mean": 0.0009820305567700416, + "clip_ratio/low_mean": 0.0010391084992988908, + "clip_ratio/low_min": 5.8049536164617166e-05, + "clip_ratio/region_mean": 0.002021139021962881, + "epoch": 1.4082822980460776, + "grad_norm": 0.37664586305618286, + "learning_rate": 1e-06, + "loss": -0.0584, + "step": 603 + }, + { + "clip_ratio/high_max": 0.002560520537372213, + "clip_ratio/high_mean": 0.0009429810361325508, + "clip_ratio/low_mean": 0.001355129446437786, + "clip_ratio/low_min": 3.9025911974022165e-05, + "clip_ratio/region_mean": 0.0022981104848440737, + "epoch": 1.410615339749198, + "grad_norm": 0.22192707657814026, + "learning_rate": 1e-06, + "loss": -0.0584, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0021960718695481773, + "clip_ratio/high_mean": 0.0009144201030721888, + "clip_ratio/low_mean": 0.0006570868190465262, + "clip_ratio/low_min": 3.0317729397211224e-05, + "clip_ratio/region_mean": 0.001571506931213662, + "completions/clipped_ratio": 0.1328125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3814.0, + "completions/mean_length": 1080.6138916015625, + "completions/mean_terminated_length": 618.7979736328125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 1.4129483814523185, + "grad_norm": 0.32067176699638367, + "learning_rate": 1e-06, + "loss": -0.0518, + "num_tokens": 90981905.0, + "reward": 0.6473214626312256, + "reward_std": 0.16586002707481384, + "rewards/verify_math_reward/mean": 0.6473214030265808, + "rewards/verify_math_reward/std": 0.47807061672210693, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0025493590328551363, + "clip_ratio/high_mean": 0.0010450735744598205, + "clip_ratio/low_mean": 0.0009328416072094114, + "clip_ratio/low_min": 6.464625766966492e-05, + "clip_ratio/region_mean": 0.0019779151625698432, + "epoch": 1.415281423155439, + "grad_norm": 0.23323972523212433, + "learning_rate": 1e-06, + "loss": -0.0521, + "step": 606 + }, + { + "clip_ratio/high_max": 0.002687867498025298, + "clip_ratio/high_mean": 0.0010876679752982454, + "clip_ratio/low_mean": 0.0010807789094542386, + "clip_ratio/low_min": 4.5476594095816836e-05, + "clip_ratio/region_mean": 0.002168446888390463, + "epoch": 1.4176144648585594, + "grad_norm": 0.20053008198738098, + "learning_rate": 1e-06, + "loss": -0.0523, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0026580548619676847, + "clip_ratio/high_mean": 0.001002402326776064, + "clip_ratio/low_mean": 0.0013172924864193192, + "clip_ratio/low_min": 0.00010611205652821809, + "clip_ratio/region_mean": 0.0023196948823169805, + "epoch": 1.4199475065616798, + "grad_norm": 0.20511440932750702, + "learning_rate": 1e-06, + "loss": -0.0523, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0022464193098130636, + "clip_ratio/high_mean": 0.0008555043514206773, + "clip_ratio/low_mean": 0.0005546764232349233, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014101807100814767, + "completions/clipped_ratio": 0.1529017857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3897.0, + "completions/mean_length": 1150.9732666015625, + "completions/mean_terminated_length": 619.3939208984375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 1.4222805482648002, + "grad_norm": 0.3018054962158203, + "learning_rate": 1e-06, + "loss": -0.0708, + "num_tokens": 91541249.0, + "reward": 0.5613839626312256, + "reward_std": 0.16461403667926788, + "rewards/verify_math_reward/mean": 0.5613839030265808, + "rewards/verify_math_reward/std": 0.496494859457016, + "step": 609 + }, + { + "clip_ratio/high_max": 0.002809393459756393, + "clip_ratio/high_mean": 0.001057904966728529, + "clip_ratio/low_mean": 0.0007229856068988738, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017808905977290124, + "epoch": 1.4246135899679206, + "grad_norm": 0.2314707636833191, + "learning_rate": 1e-06, + "loss": -0.071, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0028084185032639652, + "clip_ratio/high_mean": 0.0010672877251636237, + "clip_ratio/low_mean": 0.0008877732725522947, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001955060964974109, + "epoch": 1.426946631671041, + "grad_norm": 0.4472660720348358, + "learning_rate": 1e-06, + "loss": -0.0711, + "step": 611 + }, + { + "clip_ratio/high_max": 0.00276724090508651, + "clip_ratio/high_mean": 0.0010477610740053933, + "clip_ratio/low_mean": 0.0010973095450026449, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021450706335599534, + "epoch": 1.4292796733741615, + "grad_norm": 0.2138577103614807, + "learning_rate": 1e-06, + "loss": -0.0712, + "step": 612 + }, + { + "clip_ratio/high_max": 0.002117222800734453, + "clip_ratio/high_mean": 0.0008917890409065876, + "clip_ratio/low_mean": 0.0007474229569197632, + "clip_ratio/low_min": 1.2669775060203392e-05, + "clip_ratio/region_mean": 0.0016392120087402873, + "completions/clipped_ratio": 0.1462053571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3247.0, + "completions/mean_length": 1161.48779296875, + "completions/mean_terminated_length": 658.9765014648438, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 1.431612715077282, + "grad_norm": 0.2706722617149353, + "learning_rate": 1e-06, + "loss": -0.0491, + "num_tokens": 92127894.0, + "reward": 0.5803571939468384, + "reward_std": 0.1737132966518402, + "rewards/verify_math_reward/mean": 0.5803571343421936, + "rewards/verify_math_reward/std": 0.4937761425971985, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0027357905491953716, + "clip_ratio/high_mean": 0.001020476996927755, + "clip_ratio/low_mean": 0.0009552073897793889, + "clip_ratio/low_min": 1.2669775060203392e-05, + "clip_ratio/region_mean": 0.001975684383069165, + "epoch": 1.4339457567804024, + "grad_norm": 0.2501928210258484, + "learning_rate": 1e-06, + "loss": -0.0492, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0029026706470176578, + "clip_ratio/high_mean": 0.0010749668326752726, + "clip_ratio/low_mean": 0.0011807225237134844, + "clip_ratio/low_min": 2.5339550120406784e-05, + "clip_ratio/region_mean": 0.002255689279991202, + "epoch": 1.436278798483523, + "grad_norm": 0.2090653032064438, + "learning_rate": 1e-06, + "loss": -0.0495, + "step": 615 + }, + { + "clip_ratio/high_max": 0.002555416082032025, + "clip_ratio/high_mean": 0.0009618200056138448, + "clip_ratio/low_mean": 0.0014350209457916208, + "clip_ratio/low_min": 2.5572831873432733e-05, + "clip_ratio/region_mean": 0.002396840944129508, + "epoch": 1.4386118401866432, + "grad_norm": 0.2010267674922943, + "learning_rate": 1e-06, + "loss": -0.0495, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0022100016140029766, + "clip_ratio/high_mean": 0.0008261815728474176, + "clip_ratio/low_mean": 0.00046243821452662814, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012886197873740457, + "completions/clipped_ratio": 0.1238839285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4061.0, + "completions/mean_length": 1030.7645263671875, + "completions/mean_terminated_length": 597.3363037109375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 1.4409448818897639, + "grad_norm": 0.30851060152053833, + "learning_rate": 1e-06, + "loss": -0.0658, + "num_tokens": 92681131.0, + "reward": 0.6428571939468384, + "reward_std": 0.1633780300617218, + "rewards/verify_math_reward/mean": 0.6428571343421936, + "rewards/verify_math_reward/std": 0.4794250428676605, + "step": 617 + }, + { + "clip_ratio/high_max": 0.002810432379192207, + "clip_ratio/high_mean": 0.0011620204331848072, + "clip_ratio/low_mean": 0.0006768962748537888, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018389166871202178, + "epoch": 1.443277923592884, + "grad_norm": 0.2598080039024353, + "learning_rate": 1e-06, + "loss": -0.0661, + "step": 618 + }, + { + "clip_ratio/high_max": 0.002908035930886399, + "clip_ratio/high_mean": 0.0011905909195775166, + "clip_ratio/low_mean": 0.0008262346800620435, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020168255796306767, + "epoch": 1.4456109652960047, + "grad_norm": 0.19690977036952972, + "learning_rate": 1e-06, + "loss": -0.0663, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0025863999107968993, + "clip_ratio/high_mean": 0.0010566708187980112, + "clip_ratio/low_mean": 0.0009493513525740127, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002006022143177688, + "epoch": 1.4479440069991252, + "grad_norm": 0.23134976625442505, + "learning_rate": 1e-06, + "loss": -0.0663, + "step": 620 + }, + { + "clip_ratio/high_max": 0.002198130972828949, + "clip_ratio/high_mean": 0.0007040815144137014, + "clip_ratio/low_mean": 0.000685114953739685, + "clip_ratio/low_min": 1.1968594662903342e-05, + "clip_ratio/region_mean": 0.0013891964590584394, + "completions/clipped_ratio": 0.1350446428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3505.0, + "completions/mean_length": 1098.3035888671875, + "completions/mean_terminated_length": 630.276123046875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 1.4502770487022456, + "grad_norm": 0.3217826187610626, + "learning_rate": 1e-06, + "loss": -0.0273, + "num_tokens": 93258899.0, + "reward": 0.5680803656578064, + "reward_std": 0.1669451892375946, + "rewards/verify_math_reward/mean": 0.5680803656578064, + "rewards/verify_math_reward/std": 0.4956200420856476, + "step": 621 + }, + { + "clip_ratio/high_max": 0.002298268344020471, + "clip_ratio/high_mean": 0.0008164960509020602, + "clip_ratio/low_mean": 0.0009055617429112317, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017220578229171224, + "epoch": 1.452610090405366, + "grad_norm": 0.21622058749198914, + "learning_rate": 1e-06, + "loss": -0.0276, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0024621934062452056, + "clip_ratio/high_mean": 0.0007997809279913781, + "clip_ratio/low_mean": 0.0011010635334969265, + "clip_ratio/low_min": 1.1968594662903342e-05, + "clip_ratio/region_mean": 0.001900844436022453, + "epoch": 1.4549431321084865, + "grad_norm": 0.20084120333194733, + "learning_rate": 1e-06, + "loss": -0.0278, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0023159858101280406, + "clip_ratio/high_mean": 0.0008225301498896442, + "clip_ratio/low_mean": 0.0012968940864084288, + "clip_ratio/low_min": 1.1968594662903342e-05, + "clip_ratio/region_mean": 0.0021194242290221155, + "epoch": 1.457276173811607, + "grad_norm": 0.1809324324131012, + "learning_rate": 1e-06, + "loss": -0.0279, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0020966012234566733, + "clip_ratio/high_mean": 0.0008217469148803502, + "clip_ratio/low_mean": 0.0005623789793389733, + "clip_ratio/low_min": 1.354573032585904e-05, + "clip_ratio/region_mean": 0.0013841258842148818, + "completions/clipped_ratio": 0.1685267857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3344.0, + "completions/mean_length": 1279.501220703125, + "completions/mean_terminated_length": 708.6403198242188, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 1.4596092155147273, + "grad_norm": 0.2877887785434723, + "learning_rate": 1e-06, + "loss": -0.0728, + "num_tokens": 93878340.0, + "reward": 0.5446428656578064, + "reward_std": 0.16566601395606995, + "rewards/verify_math_reward/mean": 0.5446428656578064, + "rewards/verify_math_reward/std": 0.4982811510562897, + "step": 625 + }, + { + "clip_ratio/high_max": 0.00275848429009784, + "clip_ratio/high_mean": 0.0010338401443732437, + "clip_ratio/low_mean": 0.0008302076239488088, + "clip_ratio/low_min": 3.299023592262529e-05, + "clip_ratio/region_mean": 0.0018640477137523703, + "epoch": 1.4619422572178478, + "grad_norm": 0.20693060755729675, + "learning_rate": 1e-06, + "loss": -0.0731, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0028259688333491795, + "clip_ratio/high_mean": 0.001058114408806432, + "clip_ratio/low_mean": 0.0009656888905738015, + "clip_ratio/low_min": 5.362869706004858e-05, + "clip_ratio/region_mean": 0.0020238032666384242, + "epoch": 1.4642752989209682, + "grad_norm": 0.1813107430934906, + "learning_rate": 1e-06, + "loss": -0.0733, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0028315739036770537, + "clip_ratio/high_mean": 0.0009843532243394293, + "clip_ratio/low_mean": 0.0011155190331919584, + "clip_ratio/low_min": 9.688676800578833e-05, + "clip_ratio/region_mean": 0.0020998722466174513, + "epoch": 1.4666083406240886, + "grad_norm": 0.17822666466236115, + "learning_rate": 1e-06, + "loss": -0.0733, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0019807805183518212, + "clip_ratio/high_mean": 0.0008326768383994931, + "clip_ratio/low_mean": 0.00034694392343226355, + "clip_ratio/low_min": 1.3994625987834297e-05, + "clip_ratio/region_mean": 0.001179620736365905, + "completions/clipped_ratio": 0.1729910714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3853.0, + "completions/mean_length": 1252.6484375, + "completions/mean_terminated_length": 657.88525390625, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "epoch": 1.468941382327209, + "grad_norm": 0.3137721121311188, + "learning_rate": 1e-06, + "loss": -0.0701, + "num_tokens": 94456689.0, + "reward": 0.5290178656578064, + "reward_std": 0.15665017068386078, + "rewards/verify_math_reward/mean": 0.5290178656578064, + "rewards/verify_math_reward/std": 0.49943602085113525, + "step": 629 + }, + { + "clip_ratio/high_max": 0.00227245543646859, + "clip_ratio/high_mean": 0.0009395831893925788, + "clip_ratio/low_mean": 0.0005534355395866442, + "clip_ratio/low_min": 2.0550196495605633e-05, + "clip_ratio/region_mean": 0.0014930187171557918, + "epoch": 1.4712744240303295, + "grad_norm": 0.29060402512550354, + "learning_rate": 1e-06, + "loss": -0.0703, + "step": 630 + }, + { + "clip_ratio/high_max": 0.00282437232817756, + "clip_ratio/high_mean": 0.0010493311092432123, + "clip_ratio/low_mean": 0.0006986428397794953, + "clip_ratio/low_min": 6.8500658017001115e-06, + "clip_ratio/region_mean": 0.0017479739581176545, + "epoch": 1.47360746573345, + "grad_norm": 0.18267522752285004, + "learning_rate": 1e-06, + "loss": -0.0705, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0026695308988564648, + "clip_ratio/high_mean": 0.0010461520869284868, + "clip_ratio/low_mean": 0.0008833047941152472, + "clip_ratio/low_min": 4.795045970240608e-05, + "clip_ratio/region_mean": 0.001929456844663946, + "epoch": 1.4759405074365703, + "grad_norm": 0.24536316096782684, + "learning_rate": 1e-06, + "loss": -0.0705, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0018891414983954746, + "clip_ratio/high_mean": 0.0008695932247064775, + "clip_ratio/low_mean": 0.0004981261481589172, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013677193237526808, + "completions/clipped_ratio": 0.1350446428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3659.0, + "completions/mean_length": 1132.751220703125, + "completions/mean_terminated_length": 670.1019897460938, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 1.4782735491396908, + "grad_norm": 0.3137512803077698, + "learning_rate": 1e-06, + "loss": -0.0655, + "num_tokens": 95054562.0, + "reward": 0.5881696939468384, + "reward_std": 0.17784713208675385, + "rewards/verify_math_reward/mean": 0.5881696343421936, + "rewards/verify_math_reward/std": 0.4924395978450775, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0026375512752565555, + "clip_ratio/high_mean": 0.0011333199290675111, + "clip_ratio/low_mean": 0.0007825907032383839, + "clip_ratio/low_min": 2.1226014723652042e-05, + "clip_ratio/region_mean": 0.0019159107105224393, + "epoch": 1.4806065908428114, + "grad_norm": 0.23053784668445587, + "learning_rate": 1e-06, + "loss": -0.0658, + "step": 634 + }, + { + "clip_ratio/high_max": 0.002540815963584464, + "clip_ratio/high_mean": 0.0011436544646130642, + "clip_ratio/low_mean": 0.0009427386703464435, + "clip_ratio/low_min": 1.0613007361826021e-05, + "clip_ratio/region_mean": 0.0020863931204075925, + "epoch": 1.4829396325459316, + "grad_norm": 0.2093677967786789, + "learning_rate": 1e-06, + "loss": -0.0659, + "step": 635 + }, + { + "clip_ratio/high_max": 0.002613451928482391, + "clip_ratio/high_mean": 0.0011095456502516754, + "clip_ratio/low_mean": 0.0011048078104067827, + "clip_ratio/low_min": 4.2452029447304085e-05, + "clip_ratio/region_mean": 0.0022143534224596806, + "epoch": 1.4852726742490523, + "grad_norm": 0.1752631515264511, + "learning_rate": 1e-06, + "loss": -0.0661, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0017787703473004512, + "clip_ratio/high_mean": 0.0007677474686715868, + "clip_ratio/low_mean": 0.000416842095546599, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001184589524200419, + "completions/clipped_ratio": 0.1495535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3903.0, + "completions/mean_length": 1191.4888916015625, + "completions/mean_terminated_length": 680.7218017578125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 1.4876057159521727, + "grad_norm": 0.20887115597724915, + "learning_rate": 1e-06, + "loss": -0.0506, + "num_tokens": 95663376.0, + "reward": 0.5848214626312256, + "reward_std": 0.15582671761512756, + "rewards/verify_math_reward/mean": 0.5848214030265808, + "rewards/verify_math_reward/std": 0.49302801489830017, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0020016869930259418, + "clip_ratio/high_mean": 0.0008568723087591934, + "clip_ratio/low_mean": 0.0005421767809821176, + "clip_ratio/low_min": 1.433815123164095e-05, + "clip_ratio/region_mean": 0.0013990490406285971, + "epoch": 1.4899387576552932, + "grad_norm": 0.24404728412628174, + "learning_rate": 1e-06, + "loss": -0.0507, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0021760556192020886, + "clip_ratio/high_mean": 0.0009449577919440344, + "clip_ratio/low_mean": 0.0007495746449421858, + "clip_ratio/low_min": 4.012785393570084e-05, + "clip_ratio/region_mean": 0.0016945324241532944, + "epoch": 1.4922717993584136, + "grad_norm": 0.190024733543396, + "learning_rate": 1e-06, + "loss": -0.0509, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0021634527256537694, + "clip_ratio/high_mean": 0.0009296696362071089, + "clip_ratio/low_mean": 0.0008504205998178804, + "clip_ratio/low_min": 3.849515451292973e-05, + "clip_ratio/region_mean": 0.0017800902423914522, + "epoch": 1.494604841061534, + "grad_norm": 0.17286677658557892, + "learning_rate": 1e-06, + "loss": -0.0509, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0019069292393396609, + "clip_ratio/high_mean": 0.000733642582417815, + "clip_ratio/low_mean": 0.0004784600541825057, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012121026484237518, + "completions/clipped_ratio": 0.1316964285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2620.0, + "completions/mean_length": 1143.93310546875, + "completions/mean_terminated_length": 696.1902465820312, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 1.4969378827646544, + "grad_norm": 0.2399642914533615, + "learning_rate": 1e-06, + "loss": -0.0393, + "num_tokens": 96287404.0, + "reward": 0.5725446939468384, + "reward_std": 0.15018516778945923, + "rewards/verify_math_reward/mean": 0.5725446343421936, + "rewards/verify_math_reward/std": 0.49498558044433594, + "step": 641 + }, + { + "clip_ratio/high_max": 0.002361362647206988, + "clip_ratio/high_mean": 0.0009156956712104147, + "clip_ratio/low_mean": 0.0007014144121058052, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016171101087820716, + "epoch": 1.4992709244677749, + "grad_norm": 0.2206823080778122, + "learning_rate": 1e-06, + "loss": -0.0395, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0022667571320198476, + "clip_ratio/high_mean": 0.0009398542388225906, + "clip_ratio/low_mean": 0.0008125084077619249, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001752362644765526, + "epoch": 1.5016039661708953, + "grad_norm": 0.2108108103275299, + "learning_rate": 1e-06, + "loss": -0.0396, + "step": 643 + }, + { + "clip_ratio/high_max": 0.002244658535346389, + "clip_ratio/high_mean": 0.0008674152304593008, + "clip_ratio/low_mean": 0.0009937553195413784, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018611705527291633, + "epoch": 1.5039370078740157, + "grad_norm": 0.20998287200927734, + "learning_rate": 1e-06, + "loss": -0.0397, + "step": 644 + }, + { + "clip_ratio/high_max": 0.002750497020315379, + "clip_ratio/high_mean": 0.001084525643818779, + "clip_ratio/low_mean": 0.000655712927255081, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017402385419700295, + "completions/clipped_ratio": 0.1897321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3668.0, + "completions/mean_length": 1344.53125, + "completions/mean_terminated_length": 700.2479248046875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 1.5062700495771362, + "grad_norm": 0.2975298762321472, + "learning_rate": 1e-06, + "loss": -0.0639, + "num_tokens": 96891224.0, + "reward": 0.4933035969734192, + "reward_std": 0.2048664689064026, + "rewards/verify_math_reward/mean": 0.4933035671710968, + "rewards/verify_math_reward/std": 0.5002344250679016, + "step": 645 + }, + { + "clip_ratio/high_max": 0.003178340135491453, + "clip_ratio/high_mean": 0.0013070478162262589, + "clip_ratio/low_mean": 0.0008972011510195443, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002204249001806602, + "epoch": 1.5086030912802566, + "grad_norm": 0.2677991986274719, + "learning_rate": 1e-06, + "loss": -0.0642, + "step": 646 + }, + { + "clip_ratio/high_max": 0.003427130046475213, + "clip_ratio/high_mean": 0.0014177916236803867, + "clip_ratio/low_mean": 0.0010971521915053017, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0025149437788059004, + "epoch": 1.510936132983377, + "grad_norm": 0.21310929954051971, + "learning_rate": 1e-06, + "loss": -0.0644, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0029649059470102657, + "clip_ratio/high_mean": 0.001309787250647787, + "clip_ratio/low_mean": 0.0012619099179573823, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0025716971649671905, + "epoch": 1.5132691746864975, + "grad_norm": 0.2274923026561737, + "learning_rate": 1e-06, + "loss": -0.0645, + "step": 648 + }, + { + "clip_ratio/high_max": 0.002781429029710125, + "clip_ratio/high_mean": 0.001106526000512531, + "clip_ratio/low_mean": 0.0006171891363919713, + "clip_ratio/low_min": 3.19901673719869e-05, + "clip_ratio/region_mean": 0.00172371514418046, + "completions/clipped_ratio": 0.1953125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3042.0, + "completions/mean_length": 1327.8929443359375, + "completions/mean_terminated_length": 656.022216796875, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 1.5156022163896181, + "grad_norm": 0.3981626033782959, + "learning_rate": 1e-06, + "loss": -0.0705, + "num_tokens": 97465832.0, + "reward": 0.4977678656578064, + "reward_std": 0.19670122861862183, + "rewards/verify_math_reward/mean": 0.4977678656578064, + "rewards/verify_math_reward/std": 0.5002742409706116, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0034815168255590834, + "clip_ratio/high_mean": 0.0014658952386525925, + "clip_ratio/low_mean": 0.001077957467714441, + "clip_ratio/low_min": 3.458586161286803e-05, + "clip_ratio/region_mean": 0.002543852722737938, + "epoch": 1.5179352580927383, + "grad_norm": 0.34394311904907227, + "learning_rate": 1e-06, + "loss": -0.0708, + "step": 650 + }, + { + "clip_ratio/high_max": 0.00356647145235911, + "clip_ratio/high_mean": 0.0014375105420185719, + "clip_ratio/low_mean": 0.0013765477087872569, + "clip_ratio/low_min": 2.7237243102717912e-05, + "clip_ratio/region_mean": 0.0028140582362539135, + "epoch": 1.520268299795859, + "grad_norm": 0.34004664421081543, + "learning_rate": 1e-06, + "loss": -0.0711, + "step": 651 + }, + { + "clip_ratio/high_max": 0.003366349876159802, + "clip_ratio/high_mean": 0.0014541119999194052, + "clip_ratio/low_mean": 0.0016202135921048466, + "clip_ratio/low_min": 3.19901673719869e-05, + "clip_ratio/region_mean": 0.0030743255192646757, + "epoch": 1.5226013414989792, + "grad_norm": 0.2796070873737335, + "learning_rate": 1e-06, + "loss": -0.0712, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0021140613534953445, + "clip_ratio/high_mean": 0.0007855239236960188, + "clip_ratio/low_mean": 0.0005137370571901556, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012992609772481956, + "completions/clipped_ratio": 0.1640625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2928.0, + "completions/mean_length": 1189.734375, + "completions/mean_terminated_length": 619.3458251953125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 1.5249343832020998, + "grad_norm": 0.25421372056007385, + "learning_rate": 1e-06, + "loss": -0.069, + "num_tokens": 98023210.0, + "reward": 0.5267857313156128, + "reward_std": 0.15134702622890472, + "rewards/verify_math_reward/mean": 0.5267857313156128, + "rewards/verify_math_reward/std": 0.4995608627796173, + "step": 653 + }, + { + "clip_ratio/high_max": 0.002797064465994481, + "clip_ratio/high_mean": 0.0010685569450288313, + "clip_ratio/low_mean": 0.0007312407378776697, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001799797682906501, + "epoch": 1.52726742490522, + "grad_norm": 0.2175801396369934, + "learning_rate": 1e-06, + "loss": -0.0692, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0028732655955536757, + "clip_ratio/high_mean": 0.001066129141690908, + "clip_ratio/low_mean": 0.0008831094664856209, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019492386272759177, + "epoch": 1.5296004666083407, + "grad_norm": 0.20748473703861237, + "learning_rate": 1e-06, + "loss": -0.0694, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0029239171999506652, + "clip_ratio/high_mean": 0.0010204350219282787, + "clip_ratio/low_mean": 0.0011148508874612162, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021352858893806115, + "epoch": 1.531933508311461, + "grad_norm": 0.18779869377613068, + "learning_rate": 1e-06, + "loss": -0.0695, + "step": 656 + }, + { + "clip_ratio/high_max": 0.00239759481701185, + "clip_ratio/high_mean": 0.0010145321357413195, + "clip_ratio/low_mean": 0.0006554852961926372, + "clip_ratio/low_min": 1.1144793461426161e-05, + "clip_ratio/region_mean": 0.0016700173982826527, + "completions/clipped_ratio": 0.1819196428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3998.0, + "completions/mean_length": 1311.765625, + "completions/mean_terminated_length": 692.6248168945312, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 1.5342665500145816, + "grad_norm": 0.283419668674469, + "learning_rate": 1e-06, + "loss": -0.1052, + "num_tokens": 98622792.0, + "reward": 0.5613839626312256, + "reward_std": 0.19895784556865692, + "rewards/verify_math_reward/mean": 0.5613839030265808, + "rewards/verify_math_reward/std": 0.496494859457016, + "step": 657 + }, + { + "clip_ratio/high_max": 0.00306125360657461, + "clip_ratio/high_mean": 0.0013394121815508697, + "clip_ratio/low_mean": 0.0008616015857114689, + "clip_ratio/low_min": 2.059986763924826e-05, + "clip_ratio/region_mean": 0.00220101373270154, + "epoch": 1.536599591717702, + "grad_norm": 0.27444544434547424, + "learning_rate": 1e-06, + "loss": -0.1053, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0033300267459708266, + "clip_ratio/high_mean": 0.0012730805719911586, + "clip_ratio/low_mean": 0.000994399935734691, + "clip_ratio/low_min": 2.059986763924826e-05, + "clip_ratio/region_mean": 0.0022674804786220193, + "epoch": 1.5389326334208224, + "grad_norm": 0.20978401601314545, + "learning_rate": 1e-06, + "loss": -0.1055, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0031130103016039357, + "clip_ratio/high_mean": 0.001303149856539676, + "clip_ratio/low_mean": 0.0013752163686149288, + "clip_ratio/low_min": 4.119973527849652e-05, + "clip_ratio/region_mean": 0.0026783662542584352, + "epoch": 1.5412656751239429, + "grad_norm": 0.2526197135448456, + "learning_rate": 1e-06, + "loss": -0.1056, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0021811685219290666, + "clip_ratio/high_mean": 0.0008479951702611288, + "clip_ratio/low_mean": 0.0005780810424766969, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014260761854529846, + "completions/clipped_ratio": 0.1662946428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3428.0, + "completions/mean_length": 1176.641845703125, + "completions/mean_terminated_length": 594.3333129882812, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 1.5435987168270633, + "grad_norm": 0.2901129126548767, + "learning_rate": 1e-06, + "loss": -0.0516, + "num_tokens": 99160319.0, + "reward": 0.5691964626312256, + "reward_std": 0.14368948340415955, + "rewards/verify_math_reward/mean": 0.5691964030265808, + "rewards/verify_math_reward/std": 0.4954652488231659, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0025631569806137122, + "clip_ratio/high_mean": 0.0010572301398497075, + "clip_ratio/low_mean": 0.0007841288665986212, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018413590514683165, + "epoch": 1.5459317585301837, + "grad_norm": 0.23473727703094482, + "learning_rate": 1e-06, + "loss": -0.0518, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0025974552845582366, + "clip_ratio/high_mean": 0.0010332157580705825, + "clip_ratio/low_mean": 0.0009663371947681298, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00199955292919185, + "epoch": 1.5482648002333042, + "grad_norm": 0.2675100862979889, + "learning_rate": 1e-06, + "loss": -0.052, + "step": 663 + }, + { + "clip_ratio/high_max": 0.00247337876498932, + "clip_ratio/high_mean": 0.0009737393938848982, + "clip_ratio/low_mean": 0.0012372789251458016, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022110183563199826, + "epoch": 1.5505978419364246, + "grad_norm": 0.18326008319854736, + "learning_rate": 1e-06, + "loss": -0.0521, + "step": 664 + }, + { + "clip_ratio/high_max": 0.002578391147835646, + "clip_ratio/high_mean": 0.001082983690139372, + "clip_ratio/low_mean": 0.0006028006682754494, + "clip_ratio/low_min": 7.835025826352648e-06, + "clip_ratio/region_mean": 0.0016857843802426942, + "completions/clipped_ratio": 0.1540178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3464.0, + "completions/mean_length": 1177.068115234375, + "completions/mean_terminated_length": 645.653076171875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 1.552930883639545, + "grad_norm": 0.31763285398483276, + "learning_rate": 1e-06, + "loss": -0.0767, + "num_tokens": 99732628.0, + "reward": 0.5881696939468384, + "reward_std": 0.1853206306695938, + "rewards/verify_math_reward/mean": 0.5881696343421936, + "rewards/verify_math_reward/std": 0.4924395978450775, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0025883031557896174, + "clip_ratio/high_mean": 0.0011414089749450795, + "clip_ratio/low_mean": 0.0008020186633075355, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019434275673120283, + "epoch": 1.5552639253426657, + "grad_norm": 0.2709817588329315, + "learning_rate": 1e-06, + "loss": -0.0769, + "step": 666 + }, + { + "clip_ratio/high_max": 0.00291282403486548, + "clip_ratio/high_mean": 0.0012065014489053283, + "clip_ratio/low_mean": 0.0009946562222467037, + "clip_ratio/low_min": 7.835025826352648e-06, + "clip_ratio/region_mean": 0.0022011576729710214, + "epoch": 1.5575969670457859, + "grad_norm": 0.2394454926252365, + "learning_rate": 1e-06, + "loss": -0.0772, + "step": 667 + }, + { + "clip_ratio/high_max": 0.002633153955684975, + "clip_ratio/high_mean": 0.001112860440116492, + "clip_ratio/low_mean": 0.0012822716889786534, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002395132127276156, + "epoch": 1.5599300087489065, + "grad_norm": 0.22501912713050842, + "learning_rate": 1e-06, + "loss": -0.0773, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0025296201347373426, + "clip_ratio/high_mean": 0.0008875593648554059, + "clip_ratio/low_mean": 0.00047986642357500386, + "clip_ratio/low_min": 1.5363815691671334e-05, + "clip_ratio/region_mean": 0.0013674257788807154, + "completions/clipped_ratio": 0.1685267857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3748.0, + "completions/mean_length": 1283.4810791015625, + "completions/mean_terminated_length": 713.4268798828125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 1.5622630504520267, + "grad_norm": 0.2584800720214844, + "learning_rate": 1e-06, + "loss": -0.0459, + "num_tokens": 100348475.0, + "reward": 0.5491071939468384, + "reward_std": 0.14849409461021423, + "rewards/verify_math_reward/mean": 0.5491071343421936, + "rewards/verify_math_reward/std": 0.49786055088043213, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0023968944551597815, + "clip_ratio/high_mean": 0.0009445089151540742, + "clip_ratio/low_mean": 0.0006333161124985054, + "clip_ratio/low_min": 1.5363815691671334e-05, + "clip_ratio/region_mean": 0.001577825049025705, + "epoch": 1.5645960921551474, + "grad_norm": 0.23188327252864838, + "learning_rate": 1e-06, + "loss": -0.046, + "step": 670 + }, + { + "clip_ratio/high_max": 0.00333152122766478, + "clip_ratio/high_mean": 0.0010798854073073016, + "clip_ratio/low_mean": 0.0007709232504566899, + "clip_ratio/low_min": 3.16937112074811e-05, + "clip_ratio/region_mean": 0.001850808686867822, + "epoch": 1.5669291338582676, + "grad_norm": 0.1995295137166977, + "learning_rate": 1e-06, + "loss": -0.0462, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0030446229757217225, + "clip_ratio/high_mean": 0.0010142238497792277, + "clip_ratio/low_mean": 0.0009125107726504211, + "clip_ratio/low_min": 7.923427801870275e-06, + "clip_ratio/region_mean": 0.0019267346178821754, + "epoch": 1.5692621755613883, + "grad_norm": 0.18869951367378235, + "learning_rate": 1e-06, + "loss": -0.0463, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0021819696048623882, + "clip_ratio/high_mean": 0.0008233503558585653, + "clip_ratio/low_mean": 0.0006891033935971791, + "clip_ratio/low_min": 8.679349775775336e-06, + "clip_ratio/region_mean": 0.001512453796749469, + "completions/clipped_ratio": 0.1964285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3989.0, + "completions/mean_length": 1307.53466796875, + "completions/mean_terminated_length": 625.9097290039062, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 1.5715952172645085, + "grad_norm": 0.34236404299736023, + "learning_rate": 1e-06, + "loss": -0.0669, + "num_tokens": 100886930.0, + "reward": 0.5491071939468384, + "reward_std": 0.18595930933952332, + "rewards/verify_math_reward/mean": 0.5491071343421936, + "rewards/verify_math_reward/std": 0.49786055088043213, + "step": 673 + }, + { + "clip_ratio/high_max": 0.003019629279151559, + "clip_ratio/high_mean": 0.0011409016224206425, + "clip_ratio/low_mean": 0.001045637009156053, + "clip_ratio/low_min": 4.287306728656404e-05, + "clip_ratio/region_mean": 0.0021865386806894094, + "epoch": 1.5739282589676291, + "grad_norm": 0.2636945843696594, + "learning_rate": 1e-06, + "loss": -0.0673, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0031311263737734407, + "clip_ratio/high_mean": 0.001144453988672467, + "clip_ratio/low_mean": 0.0013594984848168679, + "clip_ratio/low_min": 2.5514365916023962e-05, + "clip_ratio/region_mean": 0.0025039524407475255, + "epoch": 1.5762613006707495, + "grad_norm": 0.24746200442314148, + "learning_rate": 1e-06, + "loss": -0.0675, + "step": 675 + }, + { + "clip_ratio/high_max": 0.002691456291358918, + "clip_ratio/high_mean": 0.0010489637461432721, + "clip_ratio/low_mean": 0.0015580262552248314, + "clip_ratio/low_min": 4.2611225580913015e-05, + "clip_ratio/region_mean": 0.0026069900413858704, + "epoch": 1.57859434237387, + "grad_norm": 0.24600592255592346, + "learning_rate": 1e-06, + "loss": -0.0676, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0024583497288404033, + "clip_ratio/high_mean": 0.0009814610311877914, + "clip_ratio/low_mean": 0.000726520012904075, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017079810531868134, + "completions/clipped_ratio": 0.1350446428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 1136.7723388671875, + "completions/mean_terminated_length": 674.7509765625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 1.5809273840769904, + "grad_norm": 0.31176334619522095, + "learning_rate": 1e-06, + "loss": -0.0515, + "num_tokens": 101499030.0, + "reward": 0.546875, + "reward_std": 0.188248410820961, + "rewards/verify_math_reward/mean": 0.546875, + "rewards/verify_math_reward/std": 0.4980759024620056, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0029717501529376023, + "clip_ratio/high_mean": 0.0011934057911275886, + "clip_ratio/low_mean": 0.0011119712435174733, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002305377049196977, + "epoch": 1.5832604257801108, + "grad_norm": 0.27647027373313904, + "learning_rate": 1e-06, + "loss": -0.0518, + "step": 678 + }, + { + "clip_ratio/high_max": 0.003101067639363464, + "clip_ratio/high_mean": 0.0012359481625026092, + "clip_ratio/low_mean": 0.0013322515715117333, + "clip_ratio/low_min": 2.84349407593254e-05, + "clip_ratio/region_mean": 0.0025681997431092896, + "epoch": 1.5855934674832313, + "grad_norm": 0.25151047110557556, + "learning_rate": 1e-06, + "loss": -0.052, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0029707483190577477, + "clip_ratio/high_mean": 0.0011499096181069035, + "clip_ratio/low_mean": 0.001467522828534129, + "clip_ratio/low_min": 2.84349407593254e-05, + "clip_ratio/region_mean": 0.002617432506667683, + "epoch": 1.5879265091863517, + "grad_norm": 0.23571977019309998, + "learning_rate": 1e-06, + "loss": -0.0521, + "step": 680 + }, + { + "clip_ratio/high_max": 0.002559652282798197, + "clip_ratio/high_mean": 0.0010860990314540686, + "clip_ratio/low_mean": 0.0007604357206218992, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018465347384335473, + "completions/clipped_ratio": 0.2008928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3538.0, + "completions/mean_length": 1334.6796875, + "completions/mean_terminated_length": 640.4929809570312, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 1.5902595508894721, + "grad_norm": 0.29189226031303406, + "learning_rate": 1e-06, + "loss": -0.0747, + "num_tokens": 102048711.0, + "reward": 0.5245535969734192, + "reward_std": 0.1755138486623764, + "rewards/verify_math_reward/mean": 0.5245535969734192, + "rewards/verify_math_reward/std": 0.4996756613254547, + "step": 681 + }, + { + "clip_ratio/high_max": 0.002816863951011328, + "clip_ratio/high_mean": 0.0011096646394435083, + "clip_ratio/low_mean": 0.0011371720538591035, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002246836684207665, + "epoch": 1.5925925925925926, + "grad_norm": 0.25846952199935913, + "learning_rate": 1e-06, + "loss": -0.0749, + "step": 682 + }, + { + "clip_ratio/high_max": 0.003701198067574296, + "clip_ratio/high_mean": 0.0013540929376176791, + "clip_ratio/low_mean": 0.001283191236325365, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026372841457487084, + "epoch": 1.594925634295713, + "grad_norm": 0.20855620503425598, + "learning_rate": 1e-06, + "loss": -0.0752, + "step": 683 + }, + { + "clip_ratio/high_max": 0.003204186574293999, + "clip_ratio/high_mean": 0.001233112501722644, + "clip_ratio/low_mean": 0.001455334067941294, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026884465478360653, + "epoch": 1.5972586759988334, + "grad_norm": 0.21366524696350098, + "learning_rate": 1e-06, + "loss": -0.0752, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0025050155745702796, + "clip_ratio/high_mean": 0.0009705617230792996, + "clip_ratio/low_mean": 0.0006182609540701378, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015888227062532678, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3316.0, + "completions/mean_length": 1324.4910888671875, + "completions/mean_terminated_length": 684.912109375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 1.599591717701954, + "grad_norm": 0.2937885522842407, + "learning_rate": 1e-06, + "loss": -0.06, + "num_tokens": 102639079.0, + "reward": 0.5613839626312256, + "reward_std": 0.20324954390525818, + "rewards/verify_math_reward/mean": 0.5613839030265808, + "rewards/verify_math_reward/std": 0.496494859457016, + "step": 685 + }, + { + "clip_ratio/high_max": 0.003114350685791578, + "clip_ratio/high_mean": 0.001153579549281858, + "clip_ratio/low_mean": 0.0008343398549186531, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019879194151144475, + "epoch": 1.6019247594050743, + "grad_norm": 0.22430022060871124, + "learning_rate": 1e-06, + "loss": -0.0603, + "step": 686 + }, + { + "clip_ratio/high_max": 0.002859890679246746, + "clip_ratio/high_mean": 0.0012254288703843486, + "clip_ratio/low_mean": 0.0010812852524395566, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023067141009960324, + "epoch": 1.604257801108195, + "grad_norm": 0.2645739018917084, + "learning_rate": 1e-06, + "loss": -0.0604, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0028396494963089935, + "clip_ratio/high_mean": 0.001110363347834209, + "clip_ratio/low_mean": 0.0012552885236800648, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023656518242205493, + "epoch": 1.6065908428113151, + "grad_norm": 0.25388264656066895, + "learning_rate": 1e-06, + "loss": -0.0604, + "step": 688 + }, + { + "clip_ratio/high_max": 0.002372793183894828, + "clip_ratio/high_mean": 0.0010088597427966306, + "clip_ratio/low_mean": 0.0006163730804473744, + "clip_ratio/low_min": 4.205921868560836e-05, + "clip_ratio/region_mean": 0.0016252328423433937, + "completions/clipped_ratio": 0.1551339285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3867.0, + "completions/mean_length": 1164.2421875, + "completions/mean_terminated_length": 625.9141845703125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 1.6089238845144358, + "grad_norm": 0.2801320552825928, + "learning_rate": 1e-06, + "loss": -0.0958, + "num_tokens": 103193344.0, + "reward": 0.5714285969734192, + "reward_std": 0.19068947434425354, + "rewards/verify_math_reward/mean": 0.5714285969734192, + "rewards/verify_math_reward/std": 0.49514803290367126, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0032055882111308165, + "clip_ratio/high_mean": 0.0013251628406578675, + "clip_ratio/low_mean": 0.0007281958114617737, + "clip_ratio/low_min": 1.842570782173425e-05, + "clip_ratio/region_mean": 0.0020533585993689485, + "epoch": 1.611256926217556, + "grad_norm": 0.23484808206558228, + "learning_rate": 1e-06, + "loss": -0.096, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0031799277930986136, + "clip_ratio/high_mean": 0.0013861646511941217, + "clip_ratio/low_mean": 0.0009580936939528328, + "clip_ratio/low_min": 4.6064269554335624e-05, + "clip_ratio/region_mean": 0.002344258398807142, + "epoch": 1.6135899679206767, + "grad_norm": 0.21078988909721375, + "learning_rate": 1e-06, + "loss": -0.0962, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0031392294986289926, + "clip_ratio/high_mean": 0.0013050017587374896, + "clip_ratio/low_mean": 0.0011217644498628943, + "clip_ratio/low_min": 4.6064269554335624e-05, + "clip_ratio/region_mean": 0.002426766171993222, + "epoch": 1.6159230096237969, + "grad_norm": 0.23015913367271423, + "learning_rate": 1e-06, + "loss": -0.0962, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0019772380546783097, + "clip_ratio/high_mean": 0.0007786658425175119, + "clip_ratio/low_mean": 0.0005568928290813346, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001335558725259034, + "completions/clipped_ratio": 0.2053571428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3617.0, + "completions/mean_length": 1319.727783203125, + "completions/mean_terminated_length": 602.2640380859375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 1.6182560513269175, + "grad_norm": 0.30670201778411865, + "learning_rate": 1e-06, + "loss": -0.1051, + "num_tokens": 103712468.0, + "reward": 0.5502232313156128, + "reward_std": 0.18032027781009674, + "rewards/verify_math_reward/mean": 0.5502232313156128, + "rewards/verify_math_reward/std": 0.49774909019470215, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0029129241302143782, + "clip_ratio/high_mean": 0.0011786021677835379, + "clip_ratio/low_mean": 0.0008487720242555952, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002027374204772059, + "epoch": 1.620589093030038, + "grad_norm": 0.3664519786834717, + "learning_rate": 1e-06, + "loss": -0.1054, + "step": 694 + }, + { + "clip_ratio/high_max": 0.002840742585249245, + "clip_ratio/high_mean": 0.0011297030687273946, + "clip_ratio/low_mean": 0.0009428865196241532, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020725895592477173, + "epoch": 1.6229221347331584, + "grad_norm": 0.25414541363716125, + "learning_rate": 1e-06, + "loss": -0.1056, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0028106456229579635, + "clip_ratio/high_mean": 0.0011537669524841476, + "clip_ratio/low_mean": 0.0011728714871424017, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023266384232556447, + "epoch": 1.6252551764362788, + "grad_norm": 0.24437932670116425, + "learning_rate": 1e-06, + "loss": -0.1057, + "step": 696 + }, + { + "clip_ratio/high_max": 0.002399275341304019, + "clip_ratio/high_mean": 0.0008898892119759694, + "clip_ratio/low_mean": 0.000654937247873022, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001544826453027781, + "completions/clipped_ratio": 0.1975446428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3361.0, + "completions/mean_length": 1324.7366943359375, + "completions/mean_terminated_length": 642.5202026367188, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 1.6275882181393992, + "grad_norm": 0.31290677189826965, + "learning_rate": 1e-06, + "loss": -0.0769, + "num_tokens": 104282808.0, + "reward": 0.5223214626312256, + "reward_std": 0.1779131293296814, + "rewards/verify_math_reward/mean": 0.5223214030265808, + "rewards/verify_math_reward/std": 0.49978047609329224, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0031805944818188436, + "clip_ratio/high_mean": 0.0011535340945556527, + "clip_ratio/low_mean": 0.0007945833312987816, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019481174167594872, + "epoch": 1.6299212598425197, + "grad_norm": 0.23811165988445282, + "learning_rate": 1e-06, + "loss": -0.0771, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0027703681626007892, + "clip_ratio/high_mean": 0.0010411006078356877, + "clip_ratio/low_mean": 0.0009474635662627406, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019885641668224707, + "epoch": 1.63225430154564, + "grad_norm": 0.2243320345878601, + "learning_rate": 1e-06, + "loss": -0.0772, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0030867442983435467, + "clip_ratio/high_mean": 0.0010878828761633486, + "clip_ratio/low_mean": 0.001150815664914262, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002238698521978222, + "epoch": 1.6345873432487605, + "grad_norm": 0.18212801218032837, + "learning_rate": 1e-06, + "loss": -0.0774, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0026252361203660257, + "clip_ratio/high_mean": 0.0010942014032480074, + "clip_ratio/low_mean": 0.0005422721651484608, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001636473556573037, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3663.0, + "completions/mean_length": 1305.7723388671875, + "completions/mean_terminated_length": 661.8736572265625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 1.636920384951881, + "grad_norm": 0.33585652709007263, + "learning_rate": 1e-06, + "loss": -0.0916, + "num_tokens": 104854956.0, + "reward": 0.5725446939468384, + "reward_std": 0.1841912716627121, + "rewards/verify_math_reward/mean": 0.5725446343421936, + "rewards/verify_math_reward/std": 0.49498558044433594, + "step": 701 + }, + { + "clip_ratio/high_max": 0.00304038253671024, + "clip_ratio/high_mean": 0.0013526170332625043, + "clip_ratio/low_mean": 0.0007281387952389196, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020807558539672755, + "epoch": 1.6392534266550016, + "grad_norm": 0.24745501577854156, + "learning_rate": 1e-06, + "loss": -0.0918, + "step": 702 + }, + { + "clip_ratio/high_max": 0.002915855307946913, + "clip_ratio/high_mean": 0.0012921984380227514, + "clip_ratio/low_mean": 0.0009440897629247047, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002236288222775329, + "epoch": 1.6415864683581218, + "grad_norm": 0.2093912959098816, + "learning_rate": 1e-06, + "loss": -0.0921, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0027673358854372054, + "clip_ratio/high_mean": 0.0013007218585698865, + "clip_ratio/low_mean": 0.0010847195226233453, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023854413448134437, + "epoch": 1.6439195100612425, + "grad_norm": 0.2719499170780182, + "learning_rate": 1e-06, + "loss": -0.0922, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0023966357875906397, + "clip_ratio/high_mean": 0.0009143912266154075, + "clip_ratio/low_mean": 0.0007818282201697002, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016962194422376342, + "completions/clipped_ratio": 0.2165178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3769.0, + "completions/mean_length": 1385.1864013671875, + "completions/mean_terminated_length": 636.0441284179688, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 1.6462525517643627, + "grad_norm": 0.34167617559432983, + "learning_rate": 1e-06, + "loss": -0.0604, + "num_tokens": 105392075.0, + "reward": 0.4654017984867096, + "reward_std": 0.19155599176883698, + "rewards/verify_math_reward/mean": 0.4654017984867096, + "rewards/verify_math_reward/std": 0.4990801215171814, + "step": 705 + }, + { + "clip_ratio/high_max": 0.003016867267433554, + "clip_ratio/high_mean": 0.0011957475217059255, + "clip_ratio/low_mean": 0.0011352539168001385, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023310014148592018, + "epoch": 1.6485855934674833, + "grad_norm": 0.300221711397171, + "learning_rate": 1e-06, + "loss": -0.0606, + "step": 706 + }, + { + "clip_ratio/high_max": 0.002963373059174046, + "clip_ratio/high_mean": 0.0010386884860054124, + "clip_ratio/low_mean": 0.0012534869038063334, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002292175340699032, + "epoch": 1.6509186351706036, + "grad_norm": 0.25997117161750793, + "learning_rate": 1e-06, + "loss": -0.0609, + "step": 707 + }, + { + "clip_ratio/high_max": 0.002642484017997049, + "clip_ratio/high_mean": 0.001068166990080499, + "clip_ratio/low_mean": 0.0016632107199257007, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002731377651798539, + "epoch": 1.6532516768737242, + "grad_norm": 0.29943886399269104, + "learning_rate": 1e-06, + "loss": -0.0609, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0024212583593907766, + "clip_ratio/high_mean": 0.0010675480953068472, + "clip_ratio/low_mean": 0.0004458831745068892, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015134312634472735, + "completions/clipped_ratio": 0.1852678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3461.0, + "completions/mean_length": 1296.90185546875, + "completions/mean_terminated_length": 660.39453125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 1.6555847185768444, + "grad_norm": 0.30853670835494995, + "learning_rate": 1e-06, + "loss": -0.0938, + "num_tokens": 105957619.0, + "reward": 0.5859375, + "reward_std": 0.19959722459316254, + "rewards/verify_math_reward/mean": 0.5859375, + "rewards/verify_math_reward/std": 0.4928344786167145, + "step": 709 + }, + { + "clip_ratio/high_max": 0.003249817520554643, + "clip_ratio/high_mean": 0.0013167445504222997, + "clip_ratio/low_mean": 0.0007628459834450041, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020795905438717455, + "epoch": 1.657917760279965, + "grad_norm": 0.2825876772403717, + "learning_rate": 1e-06, + "loss": -0.0941, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0033827013758127578, + "clip_ratio/high_mean": 0.001281385702895932, + "clip_ratio/low_mean": 0.0009322790992882801, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002213664738519583, + "epoch": 1.6602508019830855, + "grad_norm": 0.21970489621162415, + "learning_rate": 1e-06, + "loss": -0.0942, + "step": 711 + }, + { + "clip_ratio/high_max": 0.003062765797949396, + "clip_ratio/high_mean": 0.00115851747978013, + "clip_ratio/low_mean": 0.001118047950512846, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022765654284739867, + "epoch": 1.662583843686206, + "grad_norm": 0.22429729998111725, + "learning_rate": 1e-06, + "loss": -0.0943, + "step": 712 + }, + { + "clip_ratio/high_max": 0.002440858632326126, + "clip_ratio/high_mean": 0.0009094994184124516, + "clip_ratio/low_mean": 0.0006458879433921538, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015553873381577432, + "completions/clipped_ratio": 0.1696428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3330.0, + "completions/mean_length": 1200.372802734375, + "completions/mean_terminated_length": 608.7930297851562, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 1.6649168853893264, + "grad_norm": 0.3057929575443268, + "learning_rate": 1e-06, + "loss": -0.055, + "num_tokens": 106492745.0, + "reward": 0.5446428656578064, + "reward_std": 0.18821631371974945, + "rewards/verify_math_reward/mean": 0.5446428656578064, + "rewards/verify_math_reward/std": 0.4982811510562897, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0029230081163404975, + "clip_ratio/high_mean": 0.0011955939498875523, + "clip_ratio/low_mean": 0.0008712396156624891, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020668336073867977, + "epoch": 1.6672499270924468, + "grad_norm": 0.23520135879516602, + "learning_rate": 1e-06, + "loss": -0.0552, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0028584764659171924, + "clip_ratio/high_mean": 0.0011451138452684972, + "clip_ratio/low_mean": 0.0010565156408119947, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022016294751665555, + "epoch": 1.6695829687955672, + "grad_norm": 0.23191626369953156, + "learning_rate": 1e-06, + "loss": -0.0554, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0028426450662664138, + "clip_ratio/high_mean": 0.0010478730182512663, + "clip_ratio/low_mean": 0.001281688870221842, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002329561953956727, + "epoch": 1.6719160104986877, + "grad_norm": 0.24943673610687256, + "learning_rate": 1e-06, + "loss": -0.0555, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0026324251739424653, + "clip_ratio/high_mean": 0.0009574365285516251, + "clip_ratio/low_mean": 0.0007995056948857382, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001756942248903215, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 1261.8951416015625, + "completions/mean_terminated_length": 607.8709106445312, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 1.674249052201808, + "grad_norm": 0.3139522969722748, + "learning_rate": 1e-06, + "loss": -0.0796, + "num_tokens": 107022179.0, + "reward": 0.5558035969734192, + "reward_std": 0.18125459551811218, + "rewards/verify_math_reward/mean": 0.5558035969734192, + "rewards/verify_math_reward/std": 0.49715372920036316, + "step": 717 + }, + { + "clip_ratio/high_max": 0.003595037505874643, + "clip_ratio/high_mean": 0.0011868317760672653, + "clip_ratio/low_mean": 0.0010414816260890802, + "clip_ratio/low_min": 2.4811433831928298e-05, + "clip_ratio/region_mean": 0.0022283133985183667, + "epoch": 1.6765820939049285, + "grad_norm": 0.28548964858055115, + "learning_rate": 1e-06, + "loss": -0.08, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0035372318197914865, + "clip_ratio/high_mean": 0.001203651931064087, + "clip_ratio/low_mean": 0.0012385282934701536, + "clip_ratio/low_min": 5.047351623943541e-05, + "clip_ratio/region_mean": 0.002442180208163336, + "epoch": 1.678915135608049, + "grad_norm": 0.24397175014019012, + "learning_rate": 1e-06, + "loss": -0.0802, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0033241948622162454, + "clip_ratio/high_mean": 0.0010891852689383086, + "clip_ratio/low_mean": 0.0014839978211966809, + "clip_ratio/low_min": 3.90482464354136e-05, + "clip_ratio/region_mean": 0.0025731830755830742, + "epoch": 1.6812481773111694, + "grad_norm": 0.2252047061920166, + "learning_rate": 1e-06, + "loss": -0.0802, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0020865477781626396, + "clip_ratio/high_mean": 0.0007889104326750385, + "clip_ratio/low_mean": 0.0005723328638396197, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013612432612717384, + "completions/clipped_ratio": 0.1685267857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3685.0, + "completions/mean_length": 1201.063720703125, + "completions/mean_terminated_length": 614.3047485351562, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 1.68358121901429, + "grad_norm": 0.4459717273712158, + "learning_rate": 1e-06, + "loss": -0.072, + "num_tokens": 107562684.0, + "reward": 0.5803571939468384, + "reward_std": 0.17078480124473572, + "rewards/verify_math_reward/mean": 0.5803571343421936, + "rewards/verify_math_reward/std": 0.4937761127948761, + "step": 721 + }, + { + "clip_ratio/high_max": 0.002799970516207395, + "clip_ratio/high_mean": 0.0010848278343473794, + "clip_ratio/low_mean": 0.0008371281196559721, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019219559399061836, + "epoch": 1.6859142607174102, + "grad_norm": 0.2520245909690857, + "learning_rate": 1e-06, + "loss": -0.0723, + "step": 722 + }, + { + "clip_ratio/high_max": 0.002626189547299873, + "clip_ratio/high_mean": 0.0010460953490110114, + "clip_ratio/low_mean": 0.000988977945780789, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002035073241131613, + "epoch": 1.688247302420531, + "grad_norm": 0.2172216922044754, + "learning_rate": 1e-06, + "loss": -0.0725, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0027866967429872602, + "clip_ratio/high_mean": 0.0009972511488740565, + "clip_ratio/low_mean": 0.0012106410231353948, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022078921720094513, + "epoch": 1.690580344123651, + "grad_norm": 0.23183082044124603, + "learning_rate": 1e-06, + "loss": -0.0725, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0023170647109509446, + "clip_ratio/high_mean": 0.0008763284749875311, + "clip_ratio/low_mean": 0.0005670832501891709, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014434117256314494, + "completions/clipped_ratio": 0.2165178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3758.0, + "completions/mean_length": 1413.1998291015625, + "completions/mean_terminated_length": 671.7991333007812, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 1.6929133858267718, + "grad_norm": 0.30320650339126587, + "learning_rate": 1e-06, + "loss": -0.0736, + "num_tokens": 108138791.0, + "reward": 0.4910714626312256, + "reward_std": 0.16653545200824738, + "rewards/verify_math_reward/mean": 0.4910714328289032, + "rewards/verify_math_reward/std": 0.5001994967460632, + "step": 725 + }, + { + "clip_ratio/high_max": 0.003044892131583765, + "clip_ratio/high_mean": 0.0011829018894786714, + "clip_ratio/low_mean": 0.0008012801918084733, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019841820576402824, + "epoch": 1.695246427529892, + "grad_norm": 0.2436947077512741, + "learning_rate": 1e-06, + "loss": -0.0739, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0032057481730589643, + "clip_ratio/high_mean": 0.0011772155739890877, + "clip_ratio/low_mean": 0.0009244151933671674, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021016307546233293, + "epoch": 1.6975794692330126, + "grad_norm": 0.21844151616096497, + "learning_rate": 1e-06, + "loss": -0.074, + "step": 727 + }, + { + "clip_ratio/high_max": 0.002946708300441969, + "clip_ratio/high_mean": 0.0011590781032282393, + "clip_ratio/low_mean": 0.0011810414580395445, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002340119535801932, + "epoch": 1.6999125109361328, + "grad_norm": 0.19436538219451904, + "learning_rate": 1e-06, + "loss": -0.0741, + "step": 728 + }, + { + "clip_ratio/high_max": 0.002840550339897163, + "clip_ratio/high_mean": 0.0010615747232805006, + "clip_ratio/low_mean": 0.0006888908933433413, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001750465566146886, + "completions/clipped_ratio": 0.1372767857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3706.0, + "completions/mean_length": 1066.88623046875, + "completions/mean_terminated_length": 584.8926391601562, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 1.7022455526392535, + "grad_norm": 0.3603469431400299, + "learning_rate": 1e-06, + "loss": -0.0625, + "num_tokens": 108686689.0, + "reward": 0.6272321939468384, + "reward_std": 0.1584189236164093, + "rewards/verify_math_reward/mean": 0.6272321343421936, + "rewards/verify_math_reward/std": 0.4838111698627472, + "step": 729 + }, + { + "clip_ratio/high_max": 0.00358756499554147, + "clip_ratio/high_mean": 0.001336411858574138, + "clip_ratio/low_mean": 0.001047273280164518, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023836851287342142, + "epoch": 1.704578594342374, + "grad_norm": 0.3039668798446655, + "learning_rate": 1e-06, + "loss": -0.0628, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0037036568028270267, + "clip_ratio/high_mean": 0.001329478091065539, + "clip_ratio/low_mean": 0.0013035761201081186, + "clip_ratio/low_min": 1.1415525477787014e-05, + "clip_ratio/region_mean": 0.00263305424596183, + "epoch": 1.7069116360454943, + "grad_norm": 0.27185073494911194, + "learning_rate": 1e-06, + "loss": -0.0629, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0032777939195511863, + "clip_ratio/high_mean": 0.001269702181161847, + "clip_ratio/low_mean": 0.0014182024151523365, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026879046636167914, + "epoch": 1.7092446777486148, + "grad_norm": 0.2635992765426636, + "learning_rate": 1e-06, + "loss": -0.063, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0027937005579588003, + "clip_ratio/high_mean": 0.0009766038310772274, + "clip_ratio/low_mean": 0.0004562619997159345, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014328658253361937, + "completions/clipped_ratio": 0.2087053571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4022.0, + "completions/mean_length": 1386.69873046875, + "completions/mean_terminated_length": 672.1156616210938, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 1.7115777194517352, + "grad_norm": 0.2532927393913269, + "learning_rate": 1e-06, + "loss": -0.1131, + "num_tokens": 109256067.0, + "reward": 0.5223214626312256, + "reward_std": 0.16014528274536133, + "rewards/verify_math_reward/mean": 0.5223214030265808, + "rewards/verify_math_reward/std": 0.49978047609329224, + "step": 733 + }, + { + "clip_ratio/high_max": 0.003094894185778685, + "clip_ratio/high_mean": 0.0010996055279974826, + "clip_ratio/low_mean": 0.0005157897130629863, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016153952456079423, + "epoch": 1.7139107611548556, + "grad_norm": 0.27027636766433716, + "learning_rate": 1e-06, + "loss": -0.1133, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0035217979311710224, + "clip_ratio/high_mean": 0.0012316461052250816, + "clip_ratio/low_mean": 0.0007063940056468709, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019380400262889452, + "epoch": 1.716243802857976, + "grad_norm": 0.20090660452842712, + "learning_rate": 1e-06, + "loss": -0.1135, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0032341975238523446, + "clip_ratio/high_mean": 0.001142957709816983, + "clip_ratio/low_mean": 0.0009098773225559853, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020528350796666928, + "epoch": 1.7185768445610965, + "grad_norm": 0.19518056511878967, + "learning_rate": 1e-06, + "loss": -0.1136, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0020187238551443443, + "clip_ratio/high_mean": 0.0007821783510735258, + "clip_ratio/low_mean": 0.000512348198753898, + "clip_ratio/low_min": 1.1285662367299665e-05, + "clip_ratio/region_mean": 0.001294526558922371, + "completions/clipped_ratio": 0.1852678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3715.0, + "completions/mean_length": 1289.1820068359375, + "completions/mean_terminated_length": 650.919189453125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 1.720909886264217, + "grad_norm": 0.27669376134872437, + "learning_rate": 1e-06, + "loss": -0.0785, + "num_tokens": 109817022.0, + "reward": 0.5111607313156128, + "reward_std": 0.16732315719127655, + "rewards/verify_math_reward/mean": 0.5111607313156128, + "rewards/verify_math_reward/std": 0.5001546144485474, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0022142234665807337, + "clip_ratio/high_mean": 0.000889992157681263, + "clip_ratio/low_mean": 0.0007480007707272307, + "clip_ratio/low_min": 2.4206041416618973e-05, + "clip_ratio/region_mean": 0.00163799295478384, + "epoch": 1.7232429279673376, + "grad_norm": 0.26892563700675964, + "learning_rate": 1e-06, + "loss": -0.0787, + "step": 738 + }, + { + "clip_ratio/high_max": 0.002578092593466863, + "clip_ratio/high_mean": 0.0010435152689751703, + "clip_ratio/low_mean": 0.0008472703430015827, + "clip_ratio/low_min": 2.257132473459933e-05, + "clip_ratio/region_mean": 0.0018907856065197848, + "epoch": 1.7255759696704578, + "grad_norm": 0.1900540441274643, + "learning_rate": 1e-06, + "loss": -0.079, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0020848819622187875, + "clip_ratio/high_mean": 0.0008317502379213693, + "clip_ratio/low_mean": 0.001126940247559105, + "clip_ratio/low_min": 3.3856988011393696e-05, + "clip_ratio/region_mean": 0.0019586904818424955, + "epoch": 1.7279090113735784, + "grad_norm": 0.1855372041463852, + "learning_rate": 1e-06, + "loss": -0.079, + "step": 740 + }, + { + "clip_ratio/high_max": 0.002750308034592308, + "clip_ratio/high_mean": 0.001090041017960175, + "clip_ratio/low_mean": 0.0005552698330575367, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016453108364657965, + "completions/clipped_ratio": 0.2209821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3175.0, + "completions/mean_length": 1397.0457763671875, + "completions/mean_terminated_length": 631.4398803710938, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 1.7302420530766986, + "grad_norm": 0.37509623169898987, + "learning_rate": 1e-06, + "loss": -0.0531, + "num_tokens": 110349615.0, + "reward": 0.4687500298023224, + "reward_std": 0.164472758769989, + "rewards/verify_math_reward/mean": 0.46875, + "rewards/verify_math_reward/std": 0.4993011951446533, + "step": 741 + }, + { + "clip_ratio/high_max": 0.00314707585494034, + "clip_ratio/high_mean": 0.001330789462372195, + "clip_ratio/low_mean": 0.0007947428930492606, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021255323226796463, + "epoch": 1.7325750947798193, + "grad_norm": 0.2796333134174347, + "learning_rate": 1e-06, + "loss": -0.0534, + "step": 742 + }, + { + "clip_ratio/high_max": 0.003252979200624395, + "clip_ratio/high_mean": 0.0013329988105397206, + "clip_ratio/low_mean": 0.0010988753110723337, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002431874083413277, + "epoch": 1.7349081364829395, + "grad_norm": 0.21326813101768494, + "learning_rate": 1e-06, + "loss": -0.0536, + "step": 743 + }, + { + "clip_ratio/high_max": 0.003149357784423046, + "clip_ratio/high_mean": 0.0013002731720916927, + "clip_ratio/low_mean": 0.0012321688736847136, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0025324420566903427, + "epoch": 1.7372411781860602, + "grad_norm": 0.2357005923986435, + "learning_rate": 1e-06, + "loss": -0.0537, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0026869889843510464, + "clip_ratio/high_mean": 0.0009339023745269515, + "clip_ratio/low_mean": 0.00044741597776010167, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013813183722959366, + "completions/clipped_ratio": 0.1350446428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2385.0, + "completions/mean_length": 1078.5848388671875, + "completions/mean_terminated_length": 607.478759765625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 1.7395742198891804, + "grad_norm": 0.3794609606266022, + "learning_rate": 1e-06, + "loss": -0.0508, + "num_tokens": 110908155.0, + "reward": 0.6272321939468384, + "reward_std": 0.16160815954208374, + "rewards/verify_math_reward/mean": 0.6272321343421936, + "rewards/verify_math_reward/std": 0.4838111698627472, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0030135468659864273, + "clip_ratio/high_mean": 0.0011095243444287917, + "clip_ratio/low_mean": 0.000746576579331304, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018561009055702016, + "epoch": 1.741907261592301, + "grad_norm": 0.25194844603538513, + "learning_rate": 1e-06, + "loss": -0.0512, + "step": 746 + }, + { + "clip_ratio/high_max": 0.003536265438015107, + "clip_ratio/high_mean": 0.0012160383303125855, + "clip_ratio/low_mean": 0.0007701759050178225, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019862142580677755, + "epoch": 1.7442403032954215, + "grad_norm": 0.2876931428909302, + "learning_rate": 1e-06, + "loss": -0.0513, + "step": 747 + }, + { + "clip_ratio/high_max": 0.003463274500973057, + "clip_ratio/high_mean": 0.0012149306567152962, + "clip_ratio/low_mean": 0.0010488517327758018, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002263782385853119, + "epoch": 1.7465733449985419, + "grad_norm": 0.21975436806678772, + "learning_rate": 1e-06, + "loss": -0.0515, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0023062533073243685, + "clip_ratio/high_mean": 0.0008066467999015003, + "clip_ratio/low_mean": 0.0005605802763284373, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013672271052200813, + "completions/clipped_ratio": 0.2254464285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3654.0, + "completions/mean_length": 1421.69873046875, + "completions/mean_terminated_length": 643.2996826171875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 1.7489063867016623, + "grad_norm": 0.3322184681892395, + "learning_rate": 1e-06, + "loss": -0.0811, + "num_tokens": 111445621.0, + "reward": 0.53125, + "reward_std": 0.1724012941122055, + "rewards/verify_math_reward/mean": 0.53125, + "rewards/verify_math_reward/std": 0.4993011951446533, + "step": 749 + }, + { + "clip_ratio/high_max": 0.003089862053457182, + "clip_ratio/high_mean": 0.0010738891160144703, + "clip_ratio/low_mean": 0.0006956881975384022, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017695772912702523, + "epoch": 1.7512394284047827, + "grad_norm": 0.3031192421913147, + "learning_rate": 1e-06, + "loss": -0.0814, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0025417940341867507, + "clip_ratio/high_mean": 0.0009615219769329997, + "clip_ratio/low_mean": 0.0009298260465584463, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001891348001663573, + "epoch": 1.7535724701079032, + "grad_norm": 0.21803531050682068, + "learning_rate": 1e-06, + "loss": -0.0816, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0026612037072482053, + "clip_ratio/high_mean": 0.0009760259836184559, + "clip_ratio/low_mean": 0.001015680973068811, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001991706929402426, + "epoch": 1.7559055118110236, + "grad_norm": 0.224185049533844, + "learning_rate": 1e-06, + "loss": -0.0817, + "step": 752 + }, + { + "clip_ratio/high_max": 0.002623805237817578, + "clip_ratio/high_mean": 0.0009534142445772886, + "clip_ratio/low_mean": 0.0004241333190293517, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013775475563306827, + "completions/clipped_ratio": 0.2053571428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4041.0, + "completions/mean_length": 1335.7723388671875, + "completions/mean_terminated_length": 622.455078125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 1.758238553514144, + "grad_norm": 0.30206793546676636, + "learning_rate": 1e-06, + "loss": -0.0629, + "num_tokens": 111985497.0, + "reward": 0.5569196939468384, + "reward_std": 0.1587110459804535, + "rewards/verify_math_reward/mean": 0.5569196343421936, + "rewards/verify_math_reward/std": 0.49702703952789307, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0035839638221659698, + "clip_ratio/high_mean": 0.0013280071470944677, + "clip_ratio/low_mean": 0.0007448215110343881, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020728286472149193, + "epoch": 1.7605715952172645, + "grad_norm": 0.2561754286289215, + "learning_rate": 1e-06, + "loss": -0.0632, + "step": 754 + }, + { + "clip_ratio/high_max": 0.003633407446614001, + "clip_ratio/high_mean": 0.001320830335316714, + "clip_ratio/low_mean": 0.0008764087033341639, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021972390604787506, + "epoch": 1.762904636920385, + "grad_norm": 0.23033028841018677, + "learning_rate": 1e-06, + "loss": -0.0634, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0034799278510035947, + "clip_ratio/high_mean": 0.0012372326200420503, + "clip_ratio/low_mean": 0.0010334521684853826, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022706847521476448, + "epoch": 1.7652376786235053, + "grad_norm": 0.221846804022789, + "learning_rate": 1e-06, + "loss": -0.0634, + "step": 756 + }, + { + "clip_ratio/high_max": 0.002665332518517971, + "clip_ratio/high_mean": 0.0010277658730046824, + "clip_ratio/low_mean": 0.0007625566595379496, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017903225743793882, + "completions/clipped_ratio": 0.1908482142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4020.0, + "completions/mean_length": 1378.73779296875, + "completions/mean_terminated_length": 737.838623046875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 1.767570720326626, + "grad_norm": 0.3178108036518097, + "learning_rate": 1e-06, + "loss": -0.0708, + "num_tokens": 112605214.0, + "reward": 0.5334821939468384, + "reward_std": 0.20185355842113495, + "rewards/verify_math_reward/mean": 0.5334821343421936, + "rewards/verify_math_reward/std": 0.49915632605552673, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0033235828086617403, + "clip_ratio/high_mean": 0.0011530254741956014, + "clip_ratio/low_mean": 0.0010216206592303934, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002174646080675302, + "epoch": 1.7699037620297462, + "grad_norm": 0.26831531524658203, + "learning_rate": 1e-06, + "loss": -0.071, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0035036832850892097, + "clip_ratio/high_mean": 0.0012240288524481002, + "clip_ratio/low_mean": 0.0011186093088326743, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023426382322213612, + "epoch": 1.7722368037328668, + "grad_norm": 0.2287055402994156, + "learning_rate": 1e-06, + "loss": -0.0713, + "step": 759 + }, + { + "clip_ratio/high_max": 0.002821169931848999, + "clip_ratio/high_mean": 0.0010705337044782937, + "clip_ratio/low_mean": 0.0013855133893230231, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024560470701544546, + "epoch": 1.774569845435987, + "grad_norm": 0.2927832305431366, + "learning_rate": 1e-06, + "loss": -0.0713, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0017815241080825217, + "clip_ratio/high_mean": 0.0006108197103458224, + "clip_ratio/low_mean": 0.000504789546084794, + "clip_ratio/low_min": 1.4180373909766786e-05, + "clip_ratio/region_mean": 0.0011156092550663743, + "completions/clipped_ratio": 0.1339285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3427.0, + "completions/mean_length": 1096.298095703125, + "completions/mean_terminated_length": 632.426513671875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 1.7769028871391077, + "grad_norm": 0.28425121307373047, + "learning_rate": 1e-06, + "loss": -0.0622, + "num_tokens": 113186329.0, + "reward": 0.5959821939468384, + "reward_std": 0.14324188232421875, + "rewards/verify_math_reward/mean": 0.5959821343421936, + "rewards/verify_math_reward/std": 0.490975022315979, + "step": 761 + }, + { + "clip_ratio/high_max": 0.002119680393661838, + "clip_ratio/high_mean": 0.0007032340054138331, + "clip_ratio/low_mean": 0.0007346917317363477, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014379257372638676, + "epoch": 1.779235928842228, + "grad_norm": 0.18750505149364471, + "learning_rate": 1e-06, + "loss": -0.0625, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0024135289568221197, + "clip_ratio/high_mean": 0.0008099280639726203, + "clip_ratio/low_mean": 0.0007933280356837713, + "clip_ratio/low_min": 2.127056177414488e-05, + "clip_ratio/region_mean": 0.00160325612523593, + "epoch": 1.7815689705453486, + "grad_norm": 0.24141138792037964, + "learning_rate": 1e-06, + "loss": -0.0626, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0019172006941516884, + "clip_ratio/high_mean": 0.0007264084797498072, + "clip_ratio/low_mean": 0.0009916465546666586, + "clip_ratio/low_min": 2.127056177414488e-05, + "clip_ratio/region_mean": 0.001718055016681319, + "epoch": 1.7839020122484688, + "grad_norm": 0.17011037468910217, + "learning_rate": 1e-06, + "loss": -0.0627, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0029642568333656527, + "clip_ratio/high_mean": 0.0011997792062174994, + "clip_ratio/low_mean": 0.0006717288924846798, + "clip_ratio/low_min": 1.4169122550811153e-05, + "clip_ratio/region_mean": 0.0018715080877882428, + "completions/clipped_ratio": 0.2087053571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3856.0, + "completions/mean_length": 1340.204345703125, + "completions/mean_terminated_length": 613.3582763671875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 1.7862350539515894, + "grad_norm": 0.3163980543613434, + "learning_rate": 1e-06, + "loss": -0.124, + "num_tokens": 113704848.0, + "reward": 0.5424107313156128, + "reward_std": 0.20662352442741394, + "rewards/verify_math_reward/mean": 0.5424107313156128, + "rewards/verify_math_reward/std": 0.4984763264656067, + "step": 765 + }, + { + "clip_ratio/high_max": 0.003829695677268319, + "clip_ratio/high_mean": 0.00148837465167162, + "clip_ratio/low_mean": 0.000826685465654009, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002315060126420576, + "epoch": 1.7885680956547099, + "grad_norm": 0.3120717406272888, + "learning_rate": 1e-06, + "loss": -0.1242, + "step": 766 + }, + { + "clip_ratio/high_max": 0.003974874533014372, + "clip_ratio/high_mean": 0.001538831988000311, + "clip_ratio/low_mean": 0.0011645123013295233, + "clip_ratio/low_min": 2.295262493134942e-05, + "clip_ratio/region_mean": 0.0027033442820538767, + "epoch": 1.7909011373578303, + "grad_norm": 0.26164695620536804, + "learning_rate": 1e-06, + "loss": -0.1245, + "step": 767 + }, + { + "clip_ratio/high_max": 0.003672715203720145, + "clip_ratio/high_mean": 0.001413390549714677, + "clip_ratio/low_mean": 0.0012837811300414614, + "clip_ratio/low_min": 4.620502659236081e-05, + "clip_ratio/region_mean": 0.0026971717015840113, + "epoch": 1.7932341790609507, + "grad_norm": 0.2773039638996124, + "learning_rate": 1e-06, + "loss": -0.1245, + "step": 768 + }, + { + "clip_ratio/high_max": 0.002676884803804569, + "clip_ratio/high_mean": 0.0009917986153595848, + "clip_ratio/low_mean": 0.0005035521880927263, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014953507634345442, + "completions/clipped_ratio": 0.2142857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3903.0, + "completions/mean_length": 1422.67529296875, + "completions/mean_terminated_length": 693.586669921875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 1.7955672207640712, + "grad_norm": 0.3097780644893646, + "learning_rate": 1e-06, + "loss": -0.1124, + "num_tokens": 114285629.0, + "reward": 0.4765625298023224, + "reward_std": 0.16796325147151947, + "rewards/verify_math_reward/mean": 0.4765625, + "rewards/verify_math_reward/std": 0.49972933530807495, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0027745560291805305, + "clip_ratio/high_mean": 0.0010920167151198257, + "clip_ratio/low_mean": 0.0006124778379898999, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017044945670932066, + "epoch": 1.7979002624671916, + "grad_norm": 0.24015967547893524, + "learning_rate": 1e-06, + "loss": -0.1124, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0031230877866619267, + "clip_ratio/high_mean": 0.0010731148413469782, + "clip_ratio/low_mean": 0.0008133408282446908, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001886455633211881, + "epoch": 1.800233304170312, + "grad_norm": 0.20473533868789673, + "learning_rate": 1e-06, + "loss": -0.1127, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0028612736641662195, + "clip_ratio/high_mean": 0.0010856566805159673, + "clip_ratio/low_mean": 0.0009701266862975899, + "clip_ratio/low_min": 1.3994625987834297e-05, + "clip_ratio/region_mean": 0.002055783334071748, + "epoch": 1.8025663458734325, + "grad_norm": 0.21435610949993134, + "learning_rate": 1e-06, + "loss": -0.1127, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0022681507507513743, + "clip_ratio/high_mean": 0.0008565066564187873, + "clip_ratio/low_mean": 0.0005312314214052094, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013877380770281889, + "completions/clipped_ratio": 0.1540178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4023.0, + "completions/mean_length": 1162.469970703125, + "completions/mean_terminated_length": 628.3970947265625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 1.8048993875765529, + "grad_norm": 0.3202595114707947, + "learning_rate": 1e-06, + "loss": -0.0719, + "num_tokens": 114847410.0, + "reward": 0.5758928656578064, + "reward_std": 0.15744182467460632, + "rewards/verify_math_reward/mean": 0.5758928656578064, + "rewards/verify_math_reward/std": 0.49448275566101074, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0029461818994604982, + "clip_ratio/high_mean": 0.0010305059331585653, + "clip_ratio/low_mean": 0.0007527953118824371, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001783301260729786, + "epoch": 1.8072324292796735, + "grad_norm": 0.24247294664382935, + "learning_rate": 1e-06, + "loss": -0.0721, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0029159980040276423, + "clip_ratio/high_mean": 0.0010033698526967783, + "clip_ratio/low_mean": 0.0007841212382118101, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017874911063699983, + "epoch": 1.8095654709827937, + "grad_norm": 0.19409358501434326, + "learning_rate": 1e-06, + "loss": -0.0722, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0026246054694638588, + "clip_ratio/high_mean": 0.0009365742735099047, + "clip_ratio/low_mean": 0.0010685592246773012, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020051334613526706, + "epoch": 1.8118985126859144, + "grad_norm": 0.21823573112487793, + "learning_rate": 1e-06, + "loss": -0.0723, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0020401036454131827, + "clip_ratio/high_mean": 0.0008350547459485824, + "clip_ratio/low_mean": 0.0006044626568382228, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014395173820958007, + "completions/clipped_ratio": 0.1651785714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3060.0, + "completions/mean_length": 1199.04248046875, + "completions/mean_terminated_length": 625.8475952148438, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 1.8142315543890346, + "grad_norm": 0.2843477725982666, + "learning_rate": 1e-06, + "loss": -0.0601, + "num_tokens": 115399928.0, + "reward": 0.5368303656578064, + "reward_std": 0.1537223905324936, + "rewards/verify_math_reward/mean": 0.5368303656578064, + "rewards/verify_math_reward/std": 0.49892017245292664, + "step": 777 + }, + { + "clip_ratio/high_max": 0.002548430689785164, + "clip_ratio/high_mean": 0.0010934012916550273, + "clip_ratio/low_mean": 0.0008658578972244868, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019592591634136625, + "epoch": 1.8165645960921553, + "grad_norm": 0.24737218022346497, + "learning_rate": 1e-06, + "loss": -0.0603, + "step": 778 + }, + { + "clip_ratio/high_max": 0.002447107202897314, + "clip_ratio/high_mean": 0.001076182052202057, + "clip_ratio/low_mean": 0.0011107966804502212, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002186978621466551, + "epoch": 1.8188976377952755, + "grad_norm": 0.21529598534107208, + "learning_rate": 1e-06, + "loss": -0.0605, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0026493881123315077, + "clip_ratio/high_mean": 0.001074210747901816, + "clip_ratio/low_mean": 0.001178176313487711, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022523870793520473, + "epoch": 1.8212306794983961, + "grad_norm": 0.22326554358005524, + "learning_rate": 1e-06, + "loss": -0.0605, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0021476892434293404, + "clip_ratio/high_mean": 0.0007788121292833239, + "clip_ratio/low_mean": 0.00047618040298402775, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012549925268103834, + "completions/clipped_ratio": 0.1584821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3923.0, + "completions/mean_length": 1207.443115234375, + "completions/mean_terminated_length": 663.4442749023438, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 1.8235637212015163, + "grad_norm": 0.27336016297340393, + "learning_rate": 1e-06, + "loss": -0.1015, + "num_tokens": 115983037.0, + "reward": 0.5870535969734192, + "reward_std": 0.16326561570167542, + "rewards/verify_math_reward/mean": 0.5870535969734192, + "rewards/verify_math_reward/std": 0.49263837933540344, + "step": 781 + }, + { + "clip_ratio/high_max": 0.002120119155733846, + "clip_ratio/high_mean": 0.0009483648700552294, + "clip_ratio/low_mean": 0.000684274284139974, + "clip_ratio/low_min": 1.6382700778194703e-05, + "clip_ratio/region_mean": 0.0016326391778420657, + "epoch": 1.825896762904637, + "grad_norm": 0.19790448248386383, + "learning_rate": 1e-06, + "loss": -0.1017, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0025395368284080178, + "clip_ratio/high_mean": 0.0008975396103778621, + "clip_ratio/low_mean": 0.0007989680589162163, + "clip_ratio/low_min": 3.2765401556389406e-05, + "clip_ratio/region_mean": 0.0016965076647466049, + "epoch": 1.8282298046077574, + "grad_norm": 0.18685676157474518, + "learning_rate": 1e-06, + "loss": -0.1018, + "step": 783 + }, + { + "clip_ratio/high_max": 0.002357502049562754, + "clip_ratio/high_mean": 0.000903857222510851, + "clip_ratio/low_mean": 0.0008553564457542961, + "clip_ratio/low_min": 2.734033296292182e-05, + "clip_ratio/region_mean": 0.001759213650075253, + "epoch": 1.8305628463108778, + "grad_norm": 0.22967809438705444, + "learning_rate": 1e-06, + "loss": -0.1018, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0024036451359279454, + "clip_ratio/high_mean": 0.000801718235379667, + "clip_ratio/low_mean": 0.0007686801855015801, + "clip_ratio/low_min": 1.05645704024937e-05, + "clip_ratio/region_mean": 0.0015703984281572048, + "completions/clipped_ratio": 0.2120535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3896.0, + "completions/mean_length": 1436.62841796875, + "completions/mean_terminated_length": 720.9334106445312, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 1.8328958880139983, + "grad_norm": 0.35223352909088135, + "learning_rate": 1e-06, + "loss": -0.0373, + "num_tokens": 116592824.0, + "reward": 0.4598214626312256, + "reward_std": 0.1575952023267746, + "rewards/verify_math_reward/mean": 0.4598214328289032, + "rewards/verify_math_reward/std": 0.4986613988876343, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0029588468387373723, + "clip_ratio/high_mean": 0.0009684900815045694, + "clip_ratio/low_mean": 0.0009765402992343297, + "clip_ratio/low_min": 3.16937112074811e-05, + "clip_ratio/region_mean": 0.0019450303734629415, + "epoch": 1.8352289297171187, + "grad_norm": 0.2443126142024994, + "learning_rate": 1e-06, + "loss": -0.0374, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0035525073762983084, + "clip_ratio/high_mean": 0.0011505296388349961, + "clip_ratio/low_mean": 0.0012322973689151695, + "clip_ratio/low_min": 1.05645704024937e-05, + "clip_ratio/region_mean": 0.0023828270568628795, + "epoch": 1.8375619714202391, + "grad_norm": 0.2040264904499054, + "learning_rate": 1e-06, + "loss": -0.0377, + "step": 787 + }, + { + "clip_ratio/high_max": 0.002937351026048418, + "clip_ratio/high_mean": 0.001020425286696991, + "clip_ratio/low_mean": 0.0013868755704606883, + "clip_ratio/low_min": 3.16937112074811e-05, + "clip_ratio/region_mean": 0.002407300882623531, + "epoch": 1.8398950131233596, + "grad_norm": 0.22036604583263397, + "learning_rate": 1e-06, + "loss": -0.0377, + "step": 788 + }, + { + "clip_ratio/high_max": 0.00202498081853264, + "clip_ratio/high_mean": 0.0006558133309226832, + "clip_ratio/low_mean": 0.0005988032216919237, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012546165671665221, + "completions/clipped_ratio": 0.1908482142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2963.0, + "completions/mean_length": 1309.7935791015625, + "completions/mean_terminated_length": 652.6331176757812, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 1.84222805482648, + "grad_norm": 0.3399260342121124, + "learning_rate": 1e-06, + "loss": -0.0444, + "num_tokens": 117161519.0, + "reward": 0.4754464626312256, + "reward_std": 0.13177865743637085, + "rewards/verify_math_reward/mean": 0.4754464328289032, + "rewards/verify_math_reward/std": 0.4996756315231323, + "step": 789 + }, + { + "clip_ratio/high_max": 0.002574367510533193, + "clip_ratio/high_mean": 0.0008615906726845424, + "clip_ratio/low_mean": 0.0006940032471902668, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001555593920784304, + "epoch": 1.8445610965296004, + "grad_norm": 0.22516852617263794, + "learning_rate": 1e-06, + "loss": -0.0446, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0027376520592952147, + "clip_ratio/high_mean": 0.0008576442141929874, + "clip_ratio/low_mean": 0.0009341048044007039, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017917489458341151, + "epoch": 1.8468941382327209, + "grad_norm": 0.19522975385189056, + "learning_rate": 1e-06, + "loss": -0.0448, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0026930035164696164, + "clip_ratio/high_mean": 0.0008588131131546106, + "clip_ratio/low_mean": 0.0011109570696135052, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00196977020095801, + "epoch": 1.8492271799358413, + "grad_norm": 0.2082953304052353, + "learning_rate": 1e-06, + "loss": -0.0449, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0021975159979774617, + "clip_ratio/high_mean": 0.0010656295926310122, + "clip_ratio/low_mean": 0.0006331684171527741, + "clip_ratio/low_min": 2.886836045945529e-05, + "clip_ratio/region_mean": 0.0016987979979603551, + "completions/clipped_ratio": 0.1662946428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4013.0, + "completions/mean_length": 1188.33935546875, + "completions/mean_terminated_length": 608.3641357421875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 1.851560221638962, + "grad_norm": 0.3365528881549835, + "learning_rate": 1e-06, + "loss": -0.0678, + "num_tokens": 117696535.0, + "reward": 0.5558035969734192, + "reward_std": 0.19989821314811707, + "rewards/verify_math_reward/mean": 0.5558035969734192, + "rewards/verify_math_reward/std": 0.49715372920036316, + "step": 793 + }, + { + "clip_ratio/high_max": 0.003083037110627629, + "clip_ratio/high_mean": 0.0013537398081098218, + "clip_ratio/low_mean": 0.0009749114942678716, + "clip_ratio/low_min": 1.9500779671943747e-05, + "clip_ratio/region_mean": 0.0023286512951017357, + "epoch": 1.8538932633420822, + "grad_norm": 0.32683202624320984, + "learning_rate": 1e-06, + "loss": -0.0681, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0028669562161667272, + "clip_ratio/high_mean": 0.0013689995339518646, + "clip_ratio/low_mean": 0.0012241506410646252, + "clip_ratio/low_min": 1.9500779671943747e-05, + "clip_ratio/region_mean": 0.002593150180473458, + "epoch": 1.8562263050452028, + "grad_norm": 0.25984010100364685, + "learning_rate": 1e-06, + "loss": -0.0683, + "step": 795 + }, + { + "clip_ratio/high_max": 0.003130484779831022, + "clip_ratio/high_mean": 0.0012664634305110667, + "clip_ratio/low_mean": 0.0014365567985805683, + "clip_ratio/low_min": 1.9500779671943747e-05, + "clip_ratio/region_mean": 0.0027030202036257833, + "epoch": 1.858559346748323, + "grad_norm": 0.25461524724960327, + "learning_rate": 1e-06, + "loss": -0.0684, + "step": 796 + }, + { + "clip_ratio/high_max": 0.002819614404870663, + "clip_ratio/high_mean": 0.0010433404531795532, + "clip_ratio/low_mean": 0.000562692410312593, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016060328634921461, + "completions/clipped_ratio": 0.1640625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3896.0, + "completions/mean_length": 1210.8660888671875, + "completions/mean_terminated_length": 644.6248779296875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 1.8608923884514437, + "grad_norm": 0.33243247866630554, + "learning_rate": 1e-06, + "loss": -0.0822, + "num_tokens": 118270751.0, + "reward": 0.5625, + "reward_std": 0.18307287991046906, + "rewards/verify_math_reward/mean": 0.5625, + "rewards/verify_math_reward/std": 0.49635544419288635, + "step": 797 + }, + { + "clip_ratio/high_max": 0.003020628049853258, + "clip_ratio/high_mean": 0.0011996221910521854, + "clip_ratio/low_mean": 0.0007609488329762826, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001960570953087881, + "epoch": 1.8632254301545639, + "grad_norm": 0.2611001431941986, + "learning_rate": 1e-06, + "loss": -0.0824, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0030056221075938083, + "clip_ratio/high_mean": 0.0012223275007272605, + "clip_ratio/low_mean": 0.0008960413233580766, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021183688295423053, + "epoch": 1.8655584718576845, + "grad_norm": 0.22762948274612427, + "learning_rate": 1e-06, + "loss": -0.0826, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0030369122760021128, + "clip_ratio/high_mean": 0.0011457976415840676, + "clip_ratio/low_mean": 0.0011843351348943543, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023301327382796444, + "epoch": 1.8678915135608047, + "grad_norm": 0.22786091268062592, + "learning_rate": 1e-06, + "loss": -0.0827, + "step": 800 + }, + { + "clip_ratio/high_max": 0.002827204138156958, + "clip_ratio/high_mean": 0.0011934108260902576, + "clip_ratio/low_mean": 0.0006371737217705231, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018305845296708867, + "completions/clipped_ratio": 0.1618303571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3696.0, + "completions/mean_length": 1183.9476318359375, + "completions/mean_terminated_length": 621.7003784179688, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 1.8702245552639254, + "grad_norm": 0.3000062108039856, + "learning_rate": 1e-06, + "loss": -0.0767, + "num_tokens": 118818264.0, + "reward": 0.574776828289032, + "reward_std": 0.1907682567834854, + "rewards/verify_math_reward/mean": 0.5747767686843872, + "rewards/verify_math_reward/std": 0.49465295672416687, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0032071580208139494, + "clip_ratio/high_mean": 0.001341705417871708, + "clip_ratio/low_mean": 0.0009606630992493592, + "clip_ratio/low_min": 4.161653941991972e-05, + "clip_ratio/region_mean": 0.002302368484379258, + "epoch": 1.8725575969670458, + "grad_norm": 0.2800460457801819, + "learning_rate": 1e-06, + "loss": -0.077, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0035844488375005312, + "clip_ratio/high_mean": 0.001533605121949222, + "clip_ratio/low_mean": 0.0011396265435905661, + "clip_ratio/low_min": 1.4282450138125569e-05, + "clip_ratio/region_mean": 0.002673231625522021, + "epoch": 1.8748906386701663, + "grad_norm": 0.25125962495803833, + "learning_rate": 1e-06, + "loss": -0.0772, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0033888213947648183, + "clip_ratio/high_mean": 0.0013532129996747244, + "clip_ratio/low_mean": 0.001306935651882668, + "clip_ratio/low_min": 4.2847350414376706e-05, + "clip_ratio/region_mean": 0.0026601486752042547, + "epoch": 1.8772236803732867, + "grad_norm": 0.2604830265045166, + "learning_rate": 1e-06, + "loss": -0.0773, + "step": 804 + }, + { + "clip_ratio/high_max": 0.002367687178775668, + "clip_ratio/high_mean": 0.000912133504243684, + "clip_ratio/low_mean": 0.0005989227101963479, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015110562089830637, + "completions/clipped_ratio": 0.1741071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3643.0, + "completions/mean_length": 1275.75341796875, + "completions/mean_terminated_length": 681.2149047851562, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 1.8795567220764071, + "grad_norm": 0.28715670108795166, + "learning_rate": 1e-06, + "loss": -0.0577, + "num_tokens": 119402443.0, + "reward": 0.5457589626312256, + "reward_std": 0.16570734977722168, + "rewards/verify_math_reward/mean": 0.5457589030265808, + "rewards/verify_math_reward/std": 0.4981798231601715, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0029728993104072288, + "clip_ratio/high_mean": 0.001111834862967953, + "clip_ratio/low_mean": 0.0007670903178222943, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018789252208080143, + "epoch": 1.8818897637795275, + "grad_norm": 0.21860335767269135, + "learning_rate": 1e-06, + "loss": -0.058, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0027008577017113566, + "clip_ratio/high_mean": 0.0010888749711739365, + "clip_ratio/low_mean": 0.0010045799008366885, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002093454902933445, + "epoch": 1.884222805482648, + "grad_norm": 0.21858292818069458, + "learning_rate": 1e-06, + "loss": -0.058, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0027396525474614464, + "clip_ratio/high_mean": 0.0010619849563227035, + "clip_ratio/low_mean": 0.0010708690479077632, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002132853987859562, + "epoch": 1.8865558471857684, + "grad_norm": 0.20753753185272217, + "learning_rate": 1e-06, + "loss": -0.0581, + "step": 808 + }, + { + "clip_ratio/high_max": 0.002918562593549723, + "clip_ratio/high_mean": 0.0010963252407236723, + "clip_ratio/low_mean": 0.000893263903890329, + "clip_ratio/low_min": 1.9797276763711125e-05, + "clip_ratio/region_mean": 0.0019895891564374324, + "completions/clipped_ratio": 0.2053571428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3374.0, + "completions/mean_length": 1344.7310791015625, + "completions/mean_terminated_length": 633.7289428710938, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 1.8888888888888888, + "grad_norm": 0.34875476360321045, + "learning_rate": 1e-06, + "loss": -0.0851, + "num_tokens": 119945450.0, + "reward": 0.4888392984867096, + "reward_std": 0.19268713891506195, + "rewards/verify_math_reward/mean": 0.4888392984867096, + "rewards/verify_math_reward/std": 0.5001546144485474, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0038939863807172514, + "clip_ratio/high_mean": 0.0013524092973966617, + "clip_ratio/low_mean": 0.0010970616106078523, + "clip_ratio/low_min": 3.1151354050962254e-05, + "clip_ratio/region_mean": 0.0024494708195561543, + "epoch": 1.8912219305920095, + "grad_norm": 0.3230593204498291, + "learning_rate": 1e-06, + "loss": -0.0852, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0036844285641564056, + "clip_ratio/high_mean": 0.0014590889877581503, + "clip_ratio/low_mean": 0.0015696788232162362, + "clip_ratio/low_min": 1.9797276763711125e-05, + "clip_ratio/region_mean": 0.003028767809155397, + "epoch": 1.8935549722951297, + "grad_norm": 0.2665470540523529, + "learning_rate": 1e-06, + "loss": -0.0857, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0036328440000943374, + "clip_ratio/high_mean": 0.0013039033910899889, + "clip_ratio/low_mean": 0.0017443379783799173, + "clip_ratio/low_min": 2.9695913326577283e-05, + "clip_ratio/region_mean": 0.003048241422220599, + "epoch": 1.8958880139982504, + "grad_norm": 0.25983041524887085, + "learning_rate": 1e-06, + "loss": -0.0857, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0025541995419189334, + "clip_ratio/high_mean": 0.0010109552877111128, + "clip_ratio/low_mean": 0.0006645241810474545, + "clip_ratio/low_min": 1.4460897546086926e-05, + "clip_ratio/region_mean": 0.0016754794851294719, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2481.0, + "completions/mean_length": 1209.9788818359375, + "completions/mean_terminated_length": 675.5303955078125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 1.8982210557013706, + "grad_norm": 0.2932416796684265, + "learning_rate": 1e-06, + "loss": -0.0607, + "num_tokens": 120553103.0, + "reward": 0.5502232313156128, + "reward_std": 0.15782341361045837, + "rewards/verify_math_reward/mean": 0.5502232313156128, + "rewards/verify_math_reward/std": 0.49774909019470215, + "step": 813 + }, + { + "clip_ratio/high_max": 0.003172433156578336, + "clip_ratio/high_mean": 0.0012148804053140339, + "clip_ratio/low_mean": 0.0009071936674445169, + "clip_ratio/low_min": 2.6595744202495553e-05, + "clip_ratio/region_mean": 0.002122074060025625, + "epoch": 1.9005540974044912, + "grad_norm": 0.23612117767333984, + "learning_rate": 1e-06, + "loss": -0.0609, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0029380835840129294, + "clip_ratio/high_mean": 0.001169846629636595, + "clip_ratio/low_mean": 0.0010059440137411002, + "clip_ratio/low_min": 2.4418832254013978e-05, + "clip_ratio/region_mean": 0.0021757906433776952, + "epoch": 1.9028871391076114, + "grad_norm": 0.2363535314798355, + "learning_rate": 1e-06, + "loss": -0.061, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0032878915008041076, + "clip_ratio/high_mean": 0.0012400051637087017, + "clip_ratio/low_mean": 0.0012415621458785608, + "clip_ratio/low_min": 2.8921795092173852e-05, + "clip_ratio/region_mean": 0.0024815672513796017, + "epoch": 1.905220180810732, + "grad_norm": 0.22312773764133453, + "learning_rate": 1e-06, + "loss": -0.0612, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0027170163157279603, + "clip_ratio/high_mean": 0.0010297121934854658, + "clip_ratio/low_mean": 0.0006148753263914841, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016445875189674553, + "completions/clipped_ratio": 0.1618303571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3367.0, + "completions/mean_length": 1220.407470703125, + "completions/mean_terminated_length": 665.19970703125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 1.9075532225138523, + "grad_norm": 0.30903011560440063, + "learning_rate": 1e-06, + "loss": -0.078, + "num_tokens": 121138484.0, + "reward": 0.5401785969734192, + "reward_std": 0.17543968558311462, + "rewards/verify_math_reward/mean": 0.5401785969734192, + "rewards/verify_math_reward/std": 0.49866142868995667, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0035944295013905503, + "clip_ratio/high_mean": 0.0012570860781124793, + "clip_ratio/low_mean": 0.0008134864974636002, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002070572561933659, + "epoch": 1.909886264216973, + "grad_norm": 0.2809344232082367, + "learning_rate": 1e-06, + "loss": -0.0782, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0034634525945875794, + "clip_ratio/high_mean": 0.0012651153083425015, + "clip_ratio/low_mean": 0.0009896510273392778, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002254766382975504, + "epoch": 1.9122193059200934, + "grad_norm": 0.2349781095981598, + "learning_rate": 1e-06, + "loss": -0.0785, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0031610069345333613, + "clip_ratio/high_mean": 0.0012248718412593007, + "clip_ratio/low_mean": 0.0012135470769862877, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002438418916426599, + "epoch": 1.9145523476232138, + "grad_norm": 0.24953652918338776, + "learning_rate": 1e-06, + "loss": -0.0786, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0028672609259956516, + "clip_ratio/high_mean": 0.0011765829149226192, + "clip_ratio/low_mean": 0.0007742787074676016, + "clip_ratio/low_min": 1.4405900401470717e-05, + "clip_ratio/region_mean": 0.0019508616242092103, + "completions/clipped_ratio": 0.1674107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3906.0, + "completions/mean_length": 1300.1976318359375, + "completions/mean_terminated_length": 738.0388793945312, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 1.9168853893263342, + "grad_norm": 0.30159497261047363, + "learning_rate": 1e-06, + "loss": -0.0721, + "num_tokens": 121768685.0, + "reward": 0.5089285969734192, + "reward_std": 0.21560588479042053, + "rewards/verify_math_reward/mean": 0.5089285969734192, + "rewards/verify_math_reward/std": 0.5001994967460632, + "step": 821 + }, + { + "clip_ratio/high_max": 0.002927249559434131, + "clip_ratio/high_mean": 0.0013193048471293878, + "clip_ratio/low_mean": 0.0009674110351625131, + "clip_ratio/low_min": 9.32696639210917e-06, + "clip_ratio/region_mean": 0.00228671585500706, + "epoch": 1.9192184310294547, + "grad_norm": 0.2845073342323303, + "learning_rate": 1e-06, + "loss": -0.0723, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0033219540782738477, + "clip_ratio/high_mean": 0.0013041101992712356, + "clip_ratio/low_mean": 0.0011976933037658455, + "clip_ratio/low_min": 2.798089917632751e-05, + "clip_ratio/region_mean": 0.002501803515770007, + "epoch": 1.921551472732575, + "grad_norm": 0.23670266568660736, + "learning_rate": 1e-06, + "loss": -0.0726, + "step": 823 + }, + { + "clip_ratio/high_max": 0.003028897750482429, + "clip_ratio/high_mean": 0.0012568753845698666, + "clip_ratio/low_mean": 0.001436265232769074, + "clip_ratio/low_min": 2.798089917632751e-05, + "clip_ratio/region_mean": 0.002693140646442771, + "epoch": 1.9238845144356955, + "grad_norm": 0.24771694839000702, + "learning_rate": 1e-06, + "loss": -0.0727, + "step": 824 + }, + { + "clip_ratio/high_max": 0.002573345904238522, + "clip_ratio/high_mean": 0.0009996045755542582, + "clip_ratio/low_mean": 0.0005916766449445277, + "clip_ratio/low_min": 1.4654161532234866e-05, + "clip_ratio/region_mean": 0.0015912812450551428, + "completions/clipped_ratio": 0.1863839285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3848.0, + "completions/mean_length": 1320.219970703125, + "completions/mean_terminated_length": 684.341552734375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 1.926217556138816, + "grad_norm": 0.3412525951862335, + "learning_rate": 1e-06, + "loss": -0.0693, + "num_tokens": 122361018.0, + "reward": 0.5703125, + "reward_std": 0.16457942128181458, + "rewards/verify_math_reward/mean": 0.5703125, + "rewards/verify_math_reward/std": 0.49530795216560364, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0028175055631436408, + "clip_ratio/high_mean": 0.0011196003197255777, + "clip_ratio/low_mean": 0.0007795236651872983, + "clip_ratio/low_min": 2.9308323064469732e-05, + "clip_ratio/region_mean": 0.0018991239558090456, + "epoch": 1.9285505978419364, + "grad_norm": 0.3357429504394531, + "learning_rate": 1e-06, + "loss": -0.0695, + "step": 826 + }, + { + "clip_ratio/high_max": 0.002739876370469574, + "clip_ratio/high_mean": 0.0010583981857053004, + "clip_ratio/low_mean": 0.0008846381160765304, + "clip_ratio/low_min": 3.643252784968354e-05, + "clip_ratio/region_mean": 0.0019430363063293044, + "epoch": 1.9308836395450568, + "grad_norm": 0.22380100190639496, + "learning_rate": 1e-06, + "loss": -0.0697, + "step": 827 + }, + { + "clip_ratio/high_max": 0.002658689318195684, + "clip_ratio/high_mean": 0.001039948532707058, + "clip_ratio/low_mean": 0.001099562708986923, + "clip_ratio/low_min": 7.147962605813518e-05, + "clip_ratio/region_mean": 0.0021395112489699386, + "epoch": 1.9332166812481772, + "grad_norm": 0.22586019337177277, + "learning_rate": 1e-06, + "loss": -0.0698, + "step": 828 + }, + { + "clip_ratio/high_max": 0.002943986553873401, + "clip_ratio/high_mean": 0.0011864731186506106, + "clip_ratio/low_mean": 0.0005792369875052827, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001765710097970441, + "completions/clipped_ratio": 0.1930803571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2665.0, + "completions/mean_length": 1342.927490234375, + "completions/mean_terminated_length": 684.170166015625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 1.935549722951298, + "grad_norm": 0.3448341190814972, + "learning_rate": 1e-06, + "loss": -0.1174, + "num_tokens": 122946665.0, + "reward": 0.5301339626312256, + "reward_std": 0.21578949689865112, + "rewards/verify_math_reward/mean": 0.5301339030265808, + "rewards/verify_math_reward/std": 0.49936985969543457, + "step": 829 + }, + { + "clip_ratio/high_max": 0.003517591430863831, + "clip_ratio/high_mean": 0.001492794162913924, + "clip_ratio/low_mean": 0.0008221051957661984, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002314899320481345, + "epoch": 1.937882764654418, + "grad_norm": 0.2709249258041382, + "learning_rate": 1e-06, + "loss": -0.1177, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0034801322108251043, + "clip_ratio/high_mean": 0.0015445070421264973, + "clip_ratio/low_mean": 0.0010751401532616, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026196471881121397, + "epoch": 1.9402158063575388, + "grad_norm": 0.22620545327663422, + "learning_rate": 1e-06, + "loss": -0.118, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0033398918312741444, + "clip_ratio/high_mean": 0.0014138497790554538, + "clip_ratio/low_mean": 0.0012099658979423111, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026238156788167544, + "epoch": 1.942548848060659, + "grad_norm": 0.2483496516942978, + "learning_rate": 1e-06, + "loss": -0.118, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0026773650461109355, + "clip_ratio/high_mean": 0.0010678252874640748, + "clip_ratio/low_mean": 0.00047232585620804457, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015401511409436353, + "completions/clipped_ratio": 0.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3823.0, + "completions/mean_length": 1023.07373046875, + "completions/mean_terminated_length": 584.0841674804688, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 1.9448818897637796, + "grad_norm": 0.35285845398902893, + "learning_rate": 1e-06, + "loss": -0.0493, + "num_tokens": 123489811.0, + "reward": 0.6495535969734192, + "reward_std": 0.18645039200782776, + "rewards/verify_math_reward/mean": 0.6495535969734192, + "rewards/verify_math_reward/std": 0.477376252412796, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0034253023914061487, + "clip_ratio/high_mean": 0.0013965224497951567, + "clip_ratio/low_mean": 0.0007195341222541174, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021160565956961364, + "epoch": 1.9472149314668998, + "grad_norm": 0.2906281352043152, + "learning_rate": 1e-06, + "loss": -0.0498, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0034584589520818554, + "clip_ratio/high_mean": 0.001384539864375256, + "clip_ratio/low_mean": 0.0009951573447324336, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023796972673153505, + "epoch": 1.9495479731700205, + "grad_norm": 0.22655533254146576, + "learning_rate": 1e-06, + "loss": -0.05, + "step": 835 + }, + { + "clip_ratio/high_max": 0.002966634900076315, + "clip_ratio/high_mean": 0.0012003569754597265, + "clip_ratio/low_mean": 0.0012030265734210843, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00240338352159597, + "epoch": 1.9518810148731407, + "grad_norm": 0.25467953085899353, + "learning_rate": 1e-06, + "loss": -0.05, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0023573459111503325, + "clip_ratio/high_mean": 0.0008272162631328683, + "clip_ratio/low_mean": 0.0009178584023175063, + "clip_ratio/low_min": 0.0001526445712443092, + "clip_ratio/region_mean": 0.0017450746599934064, + "completions/clipped_ratio": 0.1607142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3511.0, + "completions/mean_length": 1196.7410888671875, + "completions/mean_terminated_length": 641.5637817382812, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 1.9542140565762613, + "grad_norm": 0.3911133110523224, + "learning_rate": 1e-06, + "loss": -0.0449, + "num_tokens": 124054891.0, + "reward": 0.6004464626312256, + "reward_std": 0.1792774647474289, + "rewards/verify_math_reward/mean": 0.6004464030265808, + "rewards/verify_math_reward/std": 0.49008017778396606, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0034634961775736883, + "clip_ratio/high_mean": 0.0011457822802185547, + "clip_ratio/low_mean": 0.0012875593711214606, + "clip_ratio/low_min": 9.394089011038886e-05, + "clip_ratio/region_mean": 0.002433341673167888, + "epoch": 1.9565470982793818, + "grad_norm": 0.32581472396850586, + "learning_rate": 1e-06, + "loss": -0.0454, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0031172664603218436, + "clip_ratio/high_mean": 0.0010698406240408076, + "clip_ratio/low_mean": 0.001597761751327198, + "clip_ratio/low_min": 0.00014222434037947096, + "clip_ratio/region_mean": 0.0026676024426706135, + "epoch": 1.9588801399825022, + "grad_norm": 0.2666594386100769, + "learning_rate": 1e-06, + "loss": -0.0456, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0027453268921817653, + "clip_ratio/high_mean": 0.0009260922197427135, + "clip_ratio/low_mean": 0.002023709843342658, + "clip_ratio/low_min": 0.00011088754854426952, + "clip_ratio/region_mean": 0.0029498021976905875, + "epoch": 1.9612131816856226, + "grad_norm": 0.2504991292953491, + "learning_rate": 1e-06, + "loss": -0.0457, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0022448038835136686, + "clip_ratio/high_mean": 0.0008656120189698413, + "clip_ratio/low_mean": 0.0005494064107551822, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014150184106256347, + "completions/clipped_ratio": 0.1383928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3817.0, + "completions/mean_length": 1129.015625, + "completions/mean_terminated_length": 652.453369140625, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 1.963546223388743, + "grad_norm": 0.2795056998729706, + "learning_rate": 1e-06, + "loss": -0.0576, + "num_tokens": 124648505.0, + "reward": 0.578125, + "reward_std": 0.1662386953830719, + "rewards/verify_math_reward/mean": 0.578125, + "rewards/verify_math_reward/std": 0.4941346049308777, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0024054357418208383, + "clip_ratio/high_mean": 0.0010389593844593037, + "clip_ratio/low_mean": 0.000745671023196337, + "clip_ratio/low_min": 1.4022884897713084e-05, + "clip_ratio/region_mean": 0.0017846304181148298, + "epoch": 1.9658792650918635, + "grad_norm": 0.22816675901412964, + "learning_rate": 1e-06, + "loss": -0.0577, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0024598441887064837, + "clip_ratio/high_mean": 0.001023146014631493, + "clip_ratio/low_mean": 0.0008784550973359728, + "clip_ratio/low_min": 1.4022884897713084e-05, + "clip_ratio/region_mean": 0.0019016010846826248, + "epoch": 1.968212306794984, + "grad_norm": 0.21649332344532013, + "learning_rate": 1e-06, + "loss": -0.0579, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0024888858024496585, + "clip_ratio/high_mean": 0.0009545433949824655, + "clip_ratio/low_mean": 0.0010246585416098242, + "clip_ratio/low_min": 1.4022884897713084e-05, + "clip_ratio/region_mean": 0.0019792019520536996, + "epoch": 1.9705453484981044, + "grad_norm": 0.2124703973531723, + "learning_rate": 1e-06, + "loss": -0.058, + "step": 844 + }, + { + "clip_ratio/high_max": 0.002497881134331692, + "clip_ratio/high_mean": 0.0009816796955419704, + "clip_ratio/low_mean": 0.00041495259642942983, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013966322985652369, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3616.0, + "completions/mean_length": 1204.2154541015625, + "completions/mean_terminated_length": 604.03369140625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 1.9728783902012248, + "grad_norm": 0.33216366171836853, + "learning_rate": 1e-06, + "loss": -0.1035, + "num_tokens": 125181234.0, + "reward": 0.5948660969734192, + "reward_std": 0.16175968945026398, + "rewards/verify_math_reward/mean": 0.5948660969734192, + "rewards/verify_math_reward/std": 0.49119213223457336, + "step": 845 + }, + { + "clip_ratio/high_max": 0.003344422126247082, + "clip_ratio/high_mean": 0.001384327799314633, + "clip_ratio/low_mean": 0.0007072462321957573, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020915740460623056, + "epoch": 1.9752114319043454, + "grad_norm": 0.2383900135755539, + "learning_rate": 1e-06, + "loss": -0.1038, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0031971252901712433, + "clip_ratio/high_mean": 0.0012845905948779546, + "clip_ratio/low_mean": 0.0008054647610151733, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020900553645333275, + "epoch": 1.9775444736074657, + "grad_norm": 0.21354363858699799, + "learning_rate": 1e-06, + "loss": -0.104, + "step": 847 + }, + { + "clip_ratio/high_max": 0.003370238082425203, + "clip_ratio/high_mean": 0.0012448482084437273, + "clip_ratio/low_mean": 0.0009382779990119161, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002183126227464527, + "epoch": 1.9798775153105863, + "grad_norm": 0.2377062439918518, + "learning_rate": 1e-06, + "loss": -0.104, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0023006051560514607, + "clip_ratio/high_mean": 0.0007807049223629292, + "clip_ratio/low_mean": 0.0006758223921679019, + "clip_ratio/low_min": 1.125720427808119e-05, + "clip_ratio/region_mean": 0.0014565272940672003, + "completions/clipped_ratio": 0.1540178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3104.0, + "completions/mean_length": 1125.243408203125, + "completions/mean_terminated_length": 584.3931884765625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 1.9822105570137065, + "grad_norm": 0.3098432719707489, + "learning_rate": 1e-06, + "loss": -0.0745, + "num_tokens": 125716956.0, + "reward": 0.5602678656578064, + "reward_std": 0.1442854106426239, + "rewards/verify_math_reward/mean": 0.5602678656578064, + "rewards/verify_math_reward/std": 0.4966317415237427, + "step": 849 + }, + { + "clip_ratio/high_max": 0.002714711728913244, + "clip_ratio/high_mean": 0.0009240049912477843, + "clip_ratio/low_mean": 0.0010237854571641947, + "clip_ratio/low_min": 1.125720427808119e-05, + "clip_ratio/region_mean": 0.001947790471604094, + "epoch": 1.9845435987168272, + "grad_norm": 0.3019099533557892, + "learning_rate": 1e-06, + "loss": -0.0748, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0033190110116265714, + "clip_ratio/high_mean": 0.0011018242767022457, + "clip_ratio/low_mean": 0.0011715921486938896, + "clip_ratio/low_min": 2.471332481945865e-05, + "clip_ratio/region_mean": 0.0022734164231223986, + "epoch": 1.9868766404199474, + "grad_norm": 0.26513129472732544, + "learning_rate": 1e-06, + "loss": -0.075, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0026627587794791907, + "clip_ratio/high_mean": 0.000974896540355985, + "clip_ratio/low_mean": 0.0013590649268735433, + "clip_ratio/low_min": 1.125720427808119e-05, + "clip_ratio/region_mean": 0.0023339615072472952, + "epoch": 1.989209682123068, + "grad_norm": 0.2309122234582901, + "learning_rate": 1e-06, + "loss": -0.075, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0028166246265755035, + "clip_ratio/high_mean": 0.0010274419109919108, + "clip_ratio/low_mean": 0.0005627615837511257, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015902034610917326, + "completions/clipped_ratio": 0.1607142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3683.0, + "completions/mean_length": 1177.8560791015625, + "completions/mean_terminated_length": 619.0625, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 1.9915427238261882, + "grad_norm": 0.35362178087234497, + "learning_rate": 1e-06, + "loss": -0.0613, + "num_tokens": 126266563.0, + "reward": 0.606026828289032, + "reward_std": 0.14508774876594543, + "rewards/verify_math_reward/mean": 0.6060267686843872, + "rewards/verify_math_reward/std": 0.48890194296836853, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0031189397268462926, + "clip_ratio/high_mean": 0.0012358821859379532, + "clip_ratio/low_mean": 0.0008159829521900974, + "clip_ratio/low_min": 3.2946758437901735e-05, + "clip_ratio/region_mean": 0.0020518651654128917, + "epoch": 1.993875765529309, + "grad_norm": 0.2607204020023346, + "learning_rate": 1e-06, + "loss": -0.0616, + "step": 854 + }, + { + "clip_ratio/high_max": 0.003127547497570049, + "clip_ratio/high_mean": 0.001196386856463505, + "clip_ratio/low_mean": 0.000987397224889719, + "clip_ratio/low_min": 4.9498416046844795e-05, + "clip_ratio/region_mean": 0.0021837840831722133, + "epoch": 1.9962088072324293, + "grad_norm": 0.23352815210819244, + "learning_rate": 1e-06, + "loss": -0.0618, + "step": 855 + }, + { + "clip_ratio/high_max": 0.003170761585352011, + "clip_ratio/high_mean": 0.0011412914191168966, + "clip_ratio/low_mean": 0.0012092185934307054, + "clip_ratio/low_min": 4.9498416046844795e-05, + "clip_ratio/region_mean": 0.0023505100107286125, + "epoch": 1.9985418489355498, + "grad_norm": 0.23547892272472382, + "learning_rate": 1e-06, + "loss": -0.0619, + "step": 856 + }, + { + "clip_ratio/high_max": 0.003047192654776154, + "clip_ratio/high_mean": 0.0011949555737373885, + "clip_ratio/low_mean": 0.0007931789368740283, + "clip_ratio/low_min": 1.1322464160912205e-05, + "clip_ratio/region_mean": 0.0019881345360772684, + "completions/clipped_ratio": 0.1551339285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2954.0, + "completions/mean_length": 1159.671875, + "completions/mean_terminated_length": 620.504638671875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 2.0023330417031207, + "grad_norm": 0.4390386939048767, + "learning_rate": 1e-06, + "loss": -0.0591, + "num_tokens": 126824773.0, + "reward": 0.5714285969734192, + "reward_std": 0.17874936759471893, + "rewards/verify_math_reward/mean": 0.5714285969734192, + "rewards/verify_math_reward/std": 0.49514803290367126, + "step": 857 + }, + { + "clip_ratio/high_max": 0.003988200696767308, + "clip_ratio/high_mean": 0.0014030033053131774, + "clip_ratio/low_mean": 0.0011837988859042525, + "clip_ratio/low_min": 5.661231989506632e-05, + "clip_ratio/region_mean": 0.002586802133009769, + "epoch": 2.004666083406241, + "grad_norm": 0.2932032644748688, + "learning_rate": 1e-06, + "loss": -0.0593, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0027904905364266597, + "clip_ratio/high_mean": 0.0012420199327607406, + "clip_ratio/low_mean": 0.0014423892898776103, + "clip_ratio/low_min": 1.1322464160912205e-05, + "clip_ratio/region_mean": 0.002684409155335743, + "epoch": 2.0069991251093615, + "grad_norm": 0.2708625793457031, + "learning_rate": 1e-06, + "loss": -0.0596, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0039043327851686627, + "clip_ratio/high_mean": 0.0014121150343271438, + "clip_ratio/low_mean": 0.0015946616549626924, + "clip_ratio/low_min": 4.528985664364882e-05, + "clip_ratio/region_mean": 0.0030067766856518574, + "epoch": 2.0093321668124817, + "grad_norm": 0.2647579312324524, + "learning_rate": 1e-06, + "loss": -0.0597, + "step": 860 + }, + { + "clip_ratio/high_max": 0.002119186057825573, + "clip_ratio/high_mean": 0.0007913218196335947, + "clip_ratio/low_mean": 0.0004452995322026254, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012366213377390523, + "completions/clipped_ratio": 0.1506696428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2324.0, + "completions/mean_length": 1149.997802734375, + "completions/mean_terminated_length": 627.3823852539062, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 2.0116652085156024, + "grad_norm": 0.3910108208656311, + "learning_rate": 1e-06, + "loss": -0.0548, + "num_tokens": 127391891.0, + "reward": 0.590401828289032, + "reward_std": 0.14556488394737244, + "rewards/verify_math_reward/mean": 0.5904017686843872, + "rewards/verify_math_reward/std": 0.49203425645828247, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0025455272843828425, + "clip_ratio/high_mean": 0.0009815586654440267, + "clip_ratio/low_mean": 0.0006933992754056817, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016749579808674753, + "epoch": 2.0139982502187226, + "grad_norm": 0.19863538444042206, + "learning_rate": 1e-06, + "loss": -0.0551, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0025782149605220184, + "clip_ratio/high_mean": 0.000941302307182923, + "clip_ratio/low_mean": 0.000803361215730547, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001744663510180544, + "epoch": 2.0163312919218432, + "grad_norm": 0.2755461037158966, + "learning_rate": 1e-06, + "loss": -0.0552, + "step": 863 + }, + { + "clip_ratio/high_max": 0.002186407222325215, + "clip_ratio/high_mean": 0.0008494528065057239, + "clip_ratio/low_mean": 0.000943040229685721, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017924930216395296, + "epoch": 2.0186643336249634, + "grad_norm": 0.2356482893228531, + "learning_rate": 1e-06, + "loss": -0.0552, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0025537680339766666, + "clip_ratio/high_mean": 0.0010160112324228976, + "clip_ratio/low_mean": 0.0005593823034359957, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015753935804241337, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2578.0, + "completions/mean_length": 1229.4866943359375, + "completions/mean_terminated_length": 634.5498657226562, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 2.020997375328084, + "grad_norm": 0.3333074152469635, + "learning_rate": 1e-06, + "loss": -0.0824, + "num_tokens": 127956423.0, + "reward": 0.5524553656578064, + "reward_std": 0.16424313187599182, + "rewards/verify_math_reward/mean": 0.5524553656578064, + "rewards/verify_math_reward/std": 0.49751853942871094, + "step": 865 + }, + { + "clip_ratio/high_max": 0.002955421070510056, + "clip_ratio/high_mean": 0.0011224296758882701, + "clip_ratio/low_mean": 0.0008748107065912336, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001997240397031419, + "epoch": 2.0233304170312043, + "grad_norm": 0.25484535098075867, + "learning_rate": 1e-06, + "loss": -0.0827, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0030243089931900613, + "clip_ratio/high_mean": 0.0012547181722766254, + "clip_ratio/low_mean": 0.0010132673451153096, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022679855319438502, + "epoch": 2.025663458734325, + "grad_norm": 0.282296746969223, + "learning_rate": 1e-06, + "loss": -0.0829, + "step": 867 + }, + { + "clip_ratio/high_max": 0.002858953630493488, + "clip_ratio/high_mean": 0.0011380158939573448, + "clip_ratio/low_mean": 0.001099641729524592, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022376576234819368, + "epoch": 2.027996500437445, + "grad_norm": 0.22792915999889374, + "learning_rate": 1e-06, + "loss": -0.083, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0023803658405086026, + "clip_ratio/high_mean": 0.0008531928397133015, + "clip_ratio/low_mean": 0.0007015458722889889, + "clip_ratio/low_min": 2.8506270609796047e-05, + "clip_ratio/region_mean": 0.0015547387047263328, + "completions/clipped_ratio": 0.1707589285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3985.0, + "completions/mean_length": 1315.3851318359375, + "completions/mean_terminated_length": 742.79541015625, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 2.030329542140566, + "grad_norm": 0.3597649931907654, + "learning_rate": 1e-06, + "loss": -0.066, + "num_tokens": 128599632.0, + "reward": 0.5055803656578064, + "reward_std": 0.1872745305299759, + "rewards/verify_math_reward/mean": 0.5055803656578064, + "rewards/verify_math_reward/std": 0.5002480745315552, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0026170422934228554, + "clip_ratio/high_mean": 0.000998484163574176, + "clip_ratio/low_mean": 0.000972096813711687, + "clip_ratio/low_min": 3.850102802971378e-05, + "clip_ratio/region_mean": 0.0019705809900187887, + "epoch": 2.032662583843686, + "grad_norm": 0.2861005365848541, + "learning_rate": 1e-06, + "loss": -0.0662, + "step": 870 + }, + { + "clip_ratio/high_max": 0.003180027582857292, + "clip_ratio/high_mean": 0.0011111904823337682, + "clip_ratio/low_mean": 0.0011452123708295403, + "clip_ratio/low_min": 3.850102802971378e-05, + "clip_ratio/region_mean": 0.0022564028768101707, + "epoch": 2.0349956255468067, + "grad_norm": 0.2486860156059265, + "learning_rate": 1e-06, + "loss": -0.0664, + "step": 871 + }, + { + "clip_ratio/high_max": 0.00269731388107175, + "clip_ratio/high_mean": 0.001022971280690399, + "clip_ratio/low_mean": 0.0013633073940582108, + "clip_ratio/low_min": 2.5667352019809186e-05, + "clip_ratio/region_mean": 0.0023862786110839806, + "epoch": 2.037328667249927, + "grad_norm": 0.36107203364372253, + "learning_rate": 1e-06, + "loss": -0.0665, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0027890367164218333, + "clip_ratio/high_mean": 0.001022377216941095, + "clip_ratio/low_mean": 0.000612402500337339, + "clip_ratio/low_min": 1.7193948224303313e-05, + "clip_ratio/region_mean": 0.001634779735468328, + "completions/clipped_ratio": 0.1238839285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3322.0, + "completions/mean_length": 1023.1563110351562, + "completions/mean_terminated_length": 588.6522216796875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 2.0396617089530475, + "grad_norm": 0.338905930519104, + "learning_rate": 1e-06, + "loss": -0.0368, + "num_tokens": 129149068.0, + "reward": 0.6517857313156128, + "reward_std": 0.17029374837875366, + "rewards/verify_math_reward/mean": 0.6517857313156128, + "rewards/verify_math_reward/std": 0.47667041420936584, + "step": 873 + }, + { + "clip_ratio/high_max": 0.003292863482784014, + "clip_ratio/high_mean": 0.0011018429668183671, + "clip_ratio/low_mean": 0.0008577700646128505, + "clip_ratio/low_min": 1.691932811809238e-05, + "clip_ratio/region_mean": 0.001959613015060313, + "epoch": 2.041994750656168, + "grad_norm": 0.2677460312843323, + "learning_rate": 1e-06, + "loss": -0.037, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0032929509761743248, + "clip_ratio/high_mean": 0.001230530640896177, + "clip_ratio/low_mean": 0.0010246034125884762, + "clip_ratio/low_min": 5.4959549743216485e-05, + "clip_ratio/region_mean": 0.0022551340371137485, + "epoch": 2.0443277923592884, + "grad_norm": 0.23969261348247528, + "learning_rate": 1e-06, + "loss": -0.0372, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0036424016871023923, + "clip_ratio/high_mean": 0.0012846665631514043, + "clip_ratio/low_mean": 0.0012816219095839188, + "clip_ratio/low_min": 6.595146260224283e-05, + "clip_ratio/region_mean": 0.002566288509115111, + "epoch": 2.046660834062409, + "grad_norm": 0.22869691252708435, + "learning_rate": 1e-06, + "loss": -0.0373, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0020634450411307625, + "clip_ratio/high_mean": 0.0008783011871855706, + "clip_ratio/low_mean": 0.0007541520153608872, + "clip_ratio/low_min": 1.429551684850594e-05, + "clip_ratio/region_mean": 0.001632453189813532, + "completions/clipped_ratio": 0.2142857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2833.0, + "completions/mean_length": 1407.37841796875, + "completions/mean_terminated_length": 674.117919921875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 2.0489938757655293, + "grad_norm": 0.4166792631149292, + "learning_rate": 1e-06, + "loss": -0.0569, + "num_tokens": 129718183.0, + "reward": 0.5189732313156128, + "reward_std": 0.18329225480556488, + "rewards/verify_math_reward/mean": 0.5189732313156128, + "rewards/verify_math_reward/std": 0.49991893768310547, + "step": 877 + }, + { + "clip_ratio/high_max": 0.002453501168929506, + "clip_ratio/high_mean": 0.0010702426134230336, + "clip_ratio/low_mean": 0.0010067290804727236, + "clip_ratio/low_min": 1.429551684850594e-05, + "clip_ratio/region_mean": 0.0020769717302755453, + "epoch": 2.05132691746865, + "grad_norm": 0.24961958825588226, + "learning_rate": 1e-06, + "loss": -0.0571, + "step": 878 + }, + { + "clip_ratio/high_max": 0.002625891262141522, + "clip_ratio/high_mean": 0.0011261152212682646, + "clip_ratio/low_mean": 0.001208353744004853, + "clip_ratio/low_min": 6.284304254222661e-05, + "clip_ratio/region_mean": 0.0023344689834630117, + "epoch": 2.05365995917177, + "grad_norm": 0.23191159963607788, + "learning_rate": 1e-06, + "loss": -0.0573, + "step": 879 + }, + { + "clip_ratio/high_max": 0.002479023845808115, + "clip_ratio/high_mean": 0.0010229359704680974, + "clip_ratio/low_mean": 0.0014079654647503048, + "clip_ratio/low_min": 4.4445929233916104e-05, + "clip_ratio/region_mean": 0.002430901469779201, + "epoch": 2.055993000874891, + "grad_norm": 0.21616202592849731, + "learning_rate": 1e-06, + "loss": -0.0574, + "step": 880 + }, + { + "clip_ratio/high_max": 0.002158216477255337, + "clip_ratio/high_mean": 0.0007028610289125936, + "clip_ratio/low_mean": 0.000718300512744463, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014211615598469507, + "completions/clipped_ratio": 0.1495535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3887.0, + "completions/mean_length": 1123.07373046875, + "completions/mean_terminated_length": 600.2755737304688, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 2.058326042578011, + "grad_norm": 0.2821815311908722, + "learning_rate": 1e-06, + "loss": -0.0429, + "num_tokens": 130263505.0, + "reward": 0.5837053656578064, + "reward_std": 0.1345210075378418, + "rewards/verify_math_reward/mean": 0.5837053656578064, + "rewards/verify_math_reward/std": 0.49321892857551575, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0029855417924409267, + "clip_ratio/high_mean": 0.0009691047216620063, + "clip_ratio/low_mean": 0.0008622468667454086, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018313515683985315, + "epoch": 2.0606590842811316, + "grad_norm": 0.2728791832923889, + "learning_rate": 1e-06, + "loss": -0.0432, + "step": 882 + }, + { + "clip_ratio/high_max": 0.002525583593524061, + "clip_ratio/high_mean": 0.0008816239460429642, + "clip_ratio/low_mean": 0.0010311347541573923, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019127586492686532, + "epoch": 2.062992125984252, + "grad_norm": 0.25536054372787476, + "learning_rate": 1e-06, + "loss": -0.0433, + "step": 883 + }, + { + "clip_ratio/high_max": 0.0026348084647906944, + "clip_ratio/high_mean": 0.0008546148319510394, + "clip_ratio/low_mean": 0.001253188143891748, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002107803011313081, + "epoch": 2.0653251676873725, + "grad_norm": 0.21684107184410095, + "learning_rate": 1e-06, + "loss": -0.0434, + "step": 884 + }, + { + "clip_ratio/high_max": 0.002526842705265153, + "clip_ratio/high_mean": 0.0008572160531912232, + "clip_ratio/low_mean": 0.00048245869129459606, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013396747344813775, + "completions/clipped_ratio": 0.1707589285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3284.0, + "completions/mean_length": 1280.11279296875, + "completions/mean_terminated_length": 700.259765625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 2.0676582093904927, + "grad_norm": 0.23400527238845825, + "learning_rate": 1e-06, + "loss": -0.0832, + "num_tokens": 130875446.0, + "reward": 0.5580357313156128, + "reward_std": 0.16235631704330444, + "rewards/verify_math_reward/mean": 0.5580357313156128, + "rewards/verify_math_reward/std": 0.49689778685569763, + "step": 885 + }, + { + "clip_ratio/high_max": 0.002830396333592944, + "clip_ratio/high_mean": 0.001031815798341995, + "clip_ratio/low_mean": 0.0005952802539468394, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016270960513793398, + "epoch": 2.0699912510936134, + "grad_norm": 0.209345743060112, + "learning_rate": 1e-06, + "loss": -0.0833, + "step": 886 + }, + { + "clip_ratio/high_max": 0.003029807143320795, + "clip_ratio/high_mean": 0.0010907083978963783, + "clip_ratio/low_mean": 0.0007710019472142449, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001861710330558708, + "epoch": 2.0723242927967336, + "grad_norm": 0.26701340079307556, + "learning_rate": 1e-06, + "loss": -0.0834, + "step": 887 + }, + { + "clip_ratio/high_max": 0.0031339623310486786, + "clip_ratio/high_mean": 0.0010413353120384272, + "clip_ratio/low_mean": 0.0008892381974874297, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001930573518620804, + "epoch": 2.0746573344998542, + "grad_norm": 0.18892860412597656, + "learning_rate": 1e-06, + "loss": -0.0835, + "step": 888 + }, + { + "clip_ratio/high_max": 0.002224198036856251, + "clip_ratio/high_mean": 0.0007760922162560746, + "clip_ratio/low_mean": 0.0005144084084349743, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012905006024084287, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3943.0, + "completions/mean_length": 1177.1707763671875, + "completions/mean_terminated_length": 636.6467895507812, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 2.0769903762029744, + "grad_norm": 0.3154468238353729, + "learning_rate": 1e-06, + "loss": -0.0547, + "num_tokens": 131457871.0, + "reward": 0.6071428656578064, + "reward_std": 0.13139888644218445, + "rewards/verify_math_reward/mean": 0.6071428656578064, + "rewards/verify_math_reward/std": 0.48865827918052673, + "step": 889 + }, + { + "clip_ratio/high_max": 0.002584983390988782, + "clip_ratio/high_mean": 0.0009189535740006249, + "clip_ratio/low_mean": 0.0007904853746367735, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017094390022975858, + "epoch": 2.079323417906095, + "grad_norm": 0.23959881067276, + "learning_rate": 1e-06, + "loss": -0.0549, + "step": 890 + }, + { + "clip_ratio/high_max": 0.00268344135110965, + "clip_ratio/high_mean": 0.0008809746614133473, + "clip_ratio/low_mean": 0.0009276802393287653, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018086548880091868, + "epoch": 2.0816564596092153, + "grad_norm": 0.20634399354457855, + "learning_rate": 1e-06, + "loss": -0.055, + "step": 891 + }, + { + "clip_ratio/high_max": 0.002773980326310266, + "clip_ratio/high_mean": 0.0009158890570688527, + "clip_ratio/low_mean": 0.0010924457674263977, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002008334849961102, + "epoch": 2.083989501312336, + "grad_norm": 0.1906619518995285, + "learning_rate": 1e-06, + "loss": -0.0551, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0020683441616711207, + "clip_ratio/high_mean": 0.0007411143869830994, + "clip_ratio/low_mean": 0.0006184447379382618, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013595591444754973, + "completions/clipped_ratio": 0.1517857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3906.0, + "completions/mean_length": 1174.2410888671875, + "completions/mean_terminated_length": 651.4000244140625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.0863225430154566, + "grad_norm": 0.3441523015499115, + "learning_rate": 1e-06, + "loss": -0.0756, + "num_tokens": 132038639.0, + "reward": 0.6194196939468384, + "reward_std": 0.13444431126117706, + "rewards/verify_math_reward/mean": 0.6194196343421936, + "rewards/verify_math_reward/std": 0.48580074310302734, + "step": 893 + }, + { + "clip_ratio/high_max": 0.002379316763835959, + "clip_ratio/high_mean": 0.0009014450861286605, + "clip_ratio/low_mean": 0.0008722010829842475, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017736461850290652, + "epoch": 2.088655584718577, + "grad_norm": 0.2408902794122696, + "learning_rate": 1e-06, + "loss": -0.0758, + "step": 894 + }, + { + "clip_ratio/high_max": 0.0024297764684888534, + "clip_ratio/high_mean": 0.0009308865046477877, + "clip_ratio/low_mean": 0.0010580024304545077, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001988888980122283, + "epoch": 2.0909886264216975, + "grad_norm": 0.2293797731399536, + "learning_rate": 1e-06, + "loss": -0.076, + "step": 895 + }, + { + "clip_ratio/high_max": 0.002157798364351038, + "clip_ratio/high_mean": 0.0008320046144945081, + "clip_ratio/low_mean": 0.0011370203883416252, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001969025001017144, + "epoch": 2.0933216681248177, + "grad_norm": 0.1965988725423813, + "learning_rate": 1e-06, + "loss": -0.0761, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0025361771185998805, + "clip_ratio/high_mean": 0.000843985199026065, + "clip_ratio/low_mean": 0.0005831091912114061, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014270944047893863, + "completions/clipped_ratio": 0.1651785714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4039.0, + "completions/mean_length": 1246.141845703125, + "completions/mean_terminated_length": 682.2660522460938, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 2.0956547098279383, + "grad_norm": 0.29047125577926636, + "learning_rate": 1e-06, + "loss": -0.0672, + "num_tokens": 132647382.0, + "reward": 0.5290178656578064, + "reward_std": 0.15702247619628906, + "rewards/verify_math_reward/mean": 0.5290178656578064, + "rewards/verify_math_reward/std": 0.49943605065345764, + "step": 897 + }, + { + "clip_ratio/high_max": 0.003130072793283034, + "clip_ratio/high_mean": 0.0011506956107041333, + "clip_ratio/low_mean": 0.0007367316311501781, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018874272573157214, + "epoch": 2.0979877515310585, + "grad_norm": 0.27660995721817017, + "learning_rate": 1e-06, + "loss": -0.0674, + "step": 898 + }, + { + "clip_ratio/high_max": 0.0028732167556881905, + "clip_ratio/high_mean": 0.0011284256470389664, + "clip_ratio/low_mean": 0.000960379215030116, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020888048675260507, + "epoch": 2.100320793234179, + "grad_norm": 0.20674245059490204, + "learning_rate": 1e-06, + "loss": -0.0676, + "step": 899 + }, + { + "clip_ratio/high_max": 0.0031517503448412754, + "clip_ratio/high_mean": 0.0010357327191741206, + "clip_ratio/low_mean": 0.0009695559638203122, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00200528866116656, + "epoch": 2.1026538349372994, + "grad_norm": 0.2387486696243286, + "learning_rate": 1e-06, + "loss": -0.0676, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0027165291612618603, + "clip_ratio/high_mean": 0.0008888476822903613, + "clip_ratio/low_mean": 0.0005623961233141017, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014512437846860848, + "completions/clipped_ratio": 0.1886160714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3551.0, + "completions/mean_length": 1303.485595703125, + "completions/mean_terminated_length": 654.3314819335938, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 2.10498687664042, + "grad_norm": 0.33062219619750977, + "learning_rate": 1e-06, + "loss": -0.0628, + "num_tokens": 133214929.0, + "reward": 0.5524553656578064, + "reward_std": 0.15578144788742065, + "rewards/verify_math_reward/mean": 0.5524553656578064, + "rewards/verify_math_reward/std": 0.49751853942871094, + "step": 901 + }, + { + "clip_ratio/high_max": 0.00301307208428625, + "clip_ratio/high_mean": 0.0011444835472502746, + "clip_ratio/low_mean": 0.0009213571383952512, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020658407011069357, + "epoch": 2.1073199183435403, + "grad_norm": 0.2832688093185425, + "learning_rate": 1e-06, + "loss": -0.0631, + "step": 902 + }, + { + "clip_ratio/high_max": 0.0031409784132847562, + "clip_ratio/high_mean": 0.0011654591144178994, + "clip_ratio/low_mean": 0.0010380928288213909, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022035519432392903, + "epoch": 2.109652960046661, + "grad_norm": 0.2659349739551544, + "learning_rate": 1e-06, + "loss": -0.0633, + "step": 903 + }, + { + "clip_ratio/high_max": 0.003178455473971553, + "clip_ratio/high_mean": 0.001116634884965606, + "clip_ratio/low_mean": 0.0013046228841631091, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024212577409343794, + "epoch": 2.111986001749781, + "grad_norm": 0.23541253805160522, + "learning_rate": 1e-06, + "loss": -0.0633, + "step": 904 + }, + { + "clip_ratio/high_max": 0.002693645998078864, + "clip_ratio/high_mean": 0.001170849511254346, + "clip_ratio/low_mean": 0.000556333008717047, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017271825417992659, + "completions/clipped_ratio": 0.1819196428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2707.0, + "completions/mean_length": 1236.67529296875, + "completions/mean_terminated_length": 600.8363037109375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.114319043452902, + "grad_norm": 0.6479451656341553, + "learning_rate": 1e-06, + "loss": -0.1152, + "num_tokens": 133738094.0, + "reward": 0.590401828289032, + "reward_std": 0.18558135628700256, + "rewards/verify_math_reward/mean": 0.5904017686843872, + "rewards/verify_math_reward/std": 0.49203425645828247, + "step": 905 + }, + { + "clip_ratio/high_max": 0.003487138521450106, + "clip_ratio/high_mean": 0.0013652735142386518, + "clip_ratio/low_mean": 0.0008751920686336234, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022404655610444024, + "epoch": 2.116652085156022, + "grad_norm": 0.3784768581390381, + "learning_rate": 1e-06, + "loss": -0.1155, + "step": 906 + }, + { + "clip_ratio/high_max": 0.0031059396715136245, + "clip_ratio/high_mean": 0.0013185599891585298, + "clip_ratio/low_mean": 0.0009855562639131676, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023041162858135067, + "epoch": 2.1189851268591426, + "grad_norm": 0.37493959069252014, + "learning_rate": 1e-06, + "loss": -0.1157, + "step": 907 + }, + { + "clip_ratio/high_max": 0.0032188539116759785, + "clip_ratio/high_mean": 0.0013216879997344222, + "clip_ratio/low_mean": 0.0012801653465430718, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026018533026217483, + "epoch": 2.121318168562263, + "grad_norm": 0.23647190630435944, + "learning_rate": 1e-06, + "loss": -0.1159, + "step": 908 + }, + { + "clip_ratio/high_max": 0.00239591280114837, + "clip_ratio/high_mean": 0.000833051448353217, + "clip_ratio/low_mean": 0.0005077994796920393, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001340850900305668, + "completions/clipped_ratio": 0.1640625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2666.0, + "completions/mean_length": 1196.2957763671875, + "completions/mean_terminated_length": 627.1949462890625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 2.1236512102653835, + "grad_norm": 0.31013181805610657, + "learning_rate": 1e-06, + "loss": -0.0991, + "num_tokens": 134296271.0, + "reward": 0.609375, + "reward_std": 0.15323200821876526, + "rewards/verify_math_reward/mean": 0.609375, + "rewards/verify_math_reward/std": 0.48816296458244324, + "step": 909 + }, + { + "clip_ratio/high_max": 0.003133546299068257, + "clip_ratio/high_mean": 0.0010264157244819216, + "clip_ratio/low_mean": 0.000651560700475784, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001677976415521698, + "epoch": 2.1259842519685037, + "grad_norm": 0.2695416808128357, + "learning_rate": 1e-06, + "loss": -0.0993, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0030468583208858036, + "clip_ratio/high_mean": 0.001055873934092233, + "clip_ratio/low_mean": 0.0008514502305843052, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019073241783189587, + "epoch": 2.1283172936716244, + "grad_norm": 0.22806468605995178, + "learning_rate": 1e-06, + "loss": -0.0995, + "step": 911 + }, + { + "clip_ratio/high_max": 0.0028933032081113197, + "clip_ratio/high_mean": 0.0009494455625826959, + "clip_ratio/low_mean": 0.001000595917503233, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001950041449163109, + "epoch": 2.130650335374745, + "grad_norm": 0.21979136765003204, + "learning_rate": 1e-06, + "loss": -0.0996, + "step": 912 + }, + { + "clip_ratio/high_max": 0.003457512670138385, + "clip_ratio/high_mean": 0.0012429355201675207, + "clip_ratio/low_mean": 0.0006668744454145781, + "clip_ratio/low_min": 2.9719449230469763e-05, + "clip_ratio/region_mean": 0.001909809943754226, + "completions/clipped_ratio": 0.1785714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3929.0, + "completions/mean_length": 1314.5301513671875, + "completions/mean_terminated_length": 709.86279296875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 2.1329833770778652, + "grad_norm": 0.4042540192604065, + "learning_rate": 1e-06, + "loss": -0.054, + "num_tokens": 134913666.0, + "reward": 0.5290178656578064, + "reward_std": 0.18719714879989624, + "rewards/verify_math_reward/mean": 0.5290178656578064, + "rewards/verify_math_reward/std": 0.49943605065345764, + "step": 913 + }, + { + "clip_ratio/high_max": 0.003955599411710864, + "clip_ratio/high_mean": 0.0015156144054344622, + "clip_ratio/low_mean": 0.0010538833803366288, + "clip_ratio/low_min": 7.302063113456825e-05, + "clip_ratio/region_mean": 0.002569497715739999, + "epoch": 2.135316418780986, + "grad_norm": 0.339517742395401, + "learning_rate": 1e-06, + "loss": -0.0543, + "step": 914 + }, + { + "clip_ratio/high_max": 0.0038658706289425027, + "clip_ratio/high_mean": 0.001414357790054055, + "clip_ratio/low_mean": 0.001270181384825264, + "clip_ratio/low_min": 8.306038125738269e-05, + "clip_ratio/region_mean": 0.002684539220354054, + "epoch": 2.137649460484106, + "grad_norm": 0.300604909658432, + "learning_rate": 1e-06, + "loss": -0.0545, + "step": 915 + }, + { + "clip_ratio/high_max": 0.003662280912976712, + "clip_ratio/high_mean": 0.001388209870128776, + "clip_ratio/low_mean": 0.0015070848130562808, + "clip_ratio/low_min": 0.00012288177094887942, + "clip_ratio/region_mean": 0.0028952946086064912, + "epoch": 2.1399825021872267, + "grad_norm": 0.24068570137023926, + "learning_rate": 1e-06, + "loss": -0.0546, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0025139247190963943, + "clip_ratio/high_mean": 0.0007882582540332805, + "clip_ratio/low_mean": 0.00047680701118224533, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012650652824959252, + "completions/clipped_ratio": 0.1350446428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3895.0, + "completions/mean_length": 1088.91748046875, + "completions/mean_terminated_length": 619.424560546875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 2.142315543890347, + "grad_norm": 0.2798072099685669, + "learning_rate": 1e-06, + "loss": -0.0857, + "num_tokens": 135479312.0, + "reward": 0.6082589626312256, + "reward_std": 0.14255832135677338, + "rewards/verify_math_reward/mean": 0.6082589030265808, + "rewards/verify_math_reward/std": 0.48841193318367004, + "step": 917 + }, + { + "clip_ratio/high_max": 0.0024859030891093425, + "clip_ratio/high_mean": 0.0008973686544777593, + "clip_ratio/low_mean": 0.0007205899273685645, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016179585763893556, + "epoch": 2.1446485855934676, + "grad_norm": 0.2337394803762436, + "learning_rate": 1e-06, + "loss": -0.0859, + "step": 918 + }, + { + "clip_ratio/high_max": 0.0025061886408366263, + "clip_ratio/high_mean": 0.0008488409621350002, + "clip_ratio/low_mean": 0.0008249815755334566, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016738224949222058, + "epoch": 2.146981627296588, + "grad_norm": 0.248878613114357, + "learning_rate": 1e-06, + "loss": -0.086, + "step": 919 + }, + { + "clip_ratio/high_max": 0.002762799762422219, + "clip_ratio/high_mean": 0.0009419840571354143, + "clip_ratio/low_mean": 0.0009382011185152805, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018801851401804015, + "epoch": 2.1493146689997085, + "grad_norm": 0.22181767225265503, + "learning_rate": 1e-06, + "loss": -0.086, + "step": 920 + }, + { + "clip_ratio/high_max": 0.002830226774676703, + "clip_ratio/high_mean": 0.0008858105793478899, + "clip_ratio/low_mean": 0.0006732423416906386, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001559052907396108, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3833.0, + "completions/mean_length": 1169.8504638671875, + "completions/mean_terminated_length": 627.9708862304688, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 2.1516477107028287, + "grad_norm": 0.36873382329940796, + "learning_rate": 1e-06, + "loss": -0.0428, + "num_tokens": 136043794.0, + "reward": 0.551339328289032, + "reward_std": 0.14327649772167206, + "rewards/verify_math_reward/mean": 0.5513392686843872, + "rewards/verify_math_reward/std": 0.4976350665092468, + "step": 921 + }, + { + "clip_ratio/high_max": 0.0030737649358343333, + "clip_ratio/high_mean": 0.0010548260834184475, + "clip_ratio/low_mean": 0.0008977624897852365, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019525885509210639, + "epoch": 2.1539807524059493, + "grad_norm": 0.31384843587875366, + "learning_rate": 1e-06, + "loss": -0.0432, + "step": 922 + }, + { + "clip_ratio/high_max": 0.002901628045947291, + "clip_ratio/high_mean": 0.0010035861105279764, + "clip_ratio/low_mean": 0.0010312153040104022, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002034801429545041, + "epoch": 2.1563137941090695, + "grad_norm": 0.287413090467453, + "learning_rate": 1e-06, + "loss": -0.0433, + "step": 923 + }, + { + "clip_ratio/high_max": 0.0030378203809959814, + "clip_ratio/high_mean": 0.0010518749622860923, + "clip_ratio/low_mean": 0.001193243349007389, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002245118303108029, + "epoch": 2.15864683581219, + "grad_norm": 0.2901656925678253, + "learning_rate": 1e-06, + "loss": -0.0434, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0021673360315617174, + "clip_ratio/high_mean": 0.0007874278126109857, + "clip_ratio/low_mean": 0.00045988218789716484, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012473100287024863, + "completions/clipped_ratio": 0.1662946428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3983.0, + "completions/mean_length": 1261.01904296875, + "completions/mean_terminated_length": 695.5408325195312, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 2.1609798775153104, + "grad_norm": 0.24593819677829742, + "learning_rate": 1e-06, + "loss": -0.0537, + "num_tokens": 136653739.0, + "reward": 0.5390625, + "reward_std": 0.13880637288093567, + "rewards/verify_math_reward/mean": 0.5390625, + "rewards/verify_math_reward/std": 0.4987502098083496, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0024482507360517047, + "clip_ratio/high_mean": 0.0008538253787264694, + "clip_ratio/low_mean": 0.0006132278729182872, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014670532691525295, + "epoch": 2.163312919218431, + "grad_norm": 0.2377663403749466, + "learning_rate": 1e-06, + "loss": -0.0538, + "step": 926 + }, + { + "clip_ratio/high_max": 0.0023770478946971707, + "clip_ratio/high_mean": 0.0008966462391981622, + "clip_ratio/low_mean": 0.0008122432655000011, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001708889496512711, + "epoch": 2.1656459609215517, + "grad_norm": 0.27022865414619446, + "learning_rate": 1e-06, + "loss": -0.054, + "step": 927 + }, + { + "clip_ratio/high_max": 0.002444057034153957, + "clip_ratio/high_mean": 0.0008821404062473448, + "clip_ratio/low_mean": 0.000945021546613134, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00182716193376109, + "epoch": 2.167979002624672, + "grad_norm": 0.25843802094459534, + "learning_rate": 1e-06, + "loss": -0.0541, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0021551562094828114, + "clip_ratio/high_mean": 0.0008665754467074294, + "clip_ratio/low_mean": 0.0007805696077411994, + "clip_ratio/low_min": 6.278415912674973e-05, + "clip_ratio/region_mean": 0.0016471450726385228, + "completions/clipped_ratio": 0.1350446428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3538.0, + "completions/mean_length": 1148.51904296875, + "completions/mean_terminated_length": 688.3316650390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 2.1703120443277926, + "grad_norm": 0.3312126696109772, + "learning_rate": 1e-06, + "loss": -0.0379, + "num_tokens": 137278460.0, + "reward": 0.5167410969734192, + "reward_std": 0.17908243834972382, + "rewards/verify_math_reward/mean": 0.5167410969734192, + "rewards/verify_math_reward/std": 0.4999987483024597, + "step": 929 + }, + { + "clip_ratio/high_max": 0.0025835675405687653, + "clip_ratio/high_mean": 0.001081595979485428, + "clip_ratio/low_mean": 0.0010114546475961106, + "clip_ratio/low_min": 0.00011477376574475784, + "clip_ratio/region_mean": 0.002093050668918295, + "epoch": 2.1726450860309128, + "grad_norm": 0.2656216621398926, + "learning_rate": 1e-06, + "loss": -0.0381, + "step": 930 + }, + { + "clip_ratio/high_max": 0.00256100272963522, + "clip_ratio/high_mean": 0.0010460949542903109, + "clip_ratio/low_mean": 0.0011754417610063683, + "clip_ratio/low_min": 0.00011822393389593344, + "clip_ratio/region_mean": 0.0022215367353055626, + "epoch": 2.1749781277340334, + "grad_norm": 0.2628580927848816, + "learning_rate": 1e-06, + "loss": -0.0383, + "step": 931 + }, + { + "clip_ratio/high_max": 0.0022051738706068136, + "clip_ratio/high_mean": 0.000955240908297128, + "clip_ratio/low_mean": 0.0014340925736178178, + "clip_ratio/low_min": 0.0001505113068560604, + "clip_ratio/region_mean": 0.0023893334437161684, + "epoch": 2.1773111694371536, + "grad_norm": 0.26694920659065247, + "learning_rate": 1e-06, + "loss": -0.0384, + "step": 932 + }, + { + "clip_ratio/high_max": 0.002139591691957321, + "clip_ratio/high_mean": 0.0007941814437799621, + "clip_ratio/low_mean": 0.0005919145351072075, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013860959879821166, + "completions/clipped_ratio": 0.1283482142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3572.0, + "completions/mean_length": 1079.03125, + "completions/mean_terminated_length": 634.7913208007812, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 2.1796442111402743, + "grad_norm": 0.3061717748641968, + "learning_rate": 1e-06, + "loss": -0.044, + "num_tokens": 137881496.0, + "reward": 0.5089285969734192, + "reward_std": 0.16006723046302795, + "rewards/verify_math_reward/mean": 0.5089285969734192, + "rewards/verify_math_reward/std": 0.5001994967460632, + "step": 933 + }, + { + "clip_ratio/high_max": 0.0023438950229319744, + "clip_ratio/high_mean": 0.000926869637623895, + "clip_ratio/low_mean": 0.0009254118449462112, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018522815080359578, + "epoch": 2.1819772528433945, + "grad_norm": 0.24494388699531555, + "learning_rate": 1e-06, + "loss": -0.0442, + "step": 934 + }, + { + "clip_ratio/high_max": 0.0024403161805821583, + "clip_ratio/high_mean": 0.0009846319130701886, + "clip_ratio/low_mean": 0.0010780941793200327, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002062726103758905, + "epoch": 2.184310294546515, + "grad_norm": 0.21183714270591736, + "learning_rate": 1e-06, + "loss": -0.0444, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0020901129355479497, + "clip_ratio/high_mean": 0.0008741297460801434, + "clip_ratio/low_mean": 0.0011600875404838007, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002034217286563944, + "epoch": 2.1866433362496354, + "grad_norm": 0.24384094774723053, + "learning_rate": 1e-06, + "loss": -0.0443, + "step": 936 + }, + { + "clip_ratio/high_max": 0.001768199923390057, + "clip_ratio/high_mean": 0.000634681118754088, + "clip_ratio/low_mean": 0.0006225188540156523, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012571999686770141, + "completions/clipped_ratio": 0.1283482142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3038.0, + "completions/mean_length": 1036.712158203125, + "completions/mean_terminated_length": 586.24072265625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.188976377952756, + "grad_norm": 0.32948145270347595, + "learning_rate": 1e-06, + "loss": -0.0378, + "num_tokens": 138426646.0, + "reward": 0.5803571939468384, + "reward_std": 0.15818998217582703, + "rewards/verify_math_reward/mean": 0.5803571343421936, + "rewards/verify_math_reward/std": 0.4937761127948761, + "step": 937 + }, + { + "clip_ratio/high_max": 0.002090778416459216, + "clip_ratio/high_mean": 0.0008678984213474905, + "clip_ratio/low_mean": 0.0008075540590652963, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001675452458584914, + "epoch": 2.1913094196558762, + "grad_norm": 0.43128690123558044, + "learning_rate": 1e-06, + "loss": -0.038, + "step": 938 + }, + { + "clip_ratio/high_max": 0.0024906518374336883, + "clip_ratio/high_mean": 0.0009043059508258011, + "clip_ratio/low_mean": 0.0010591671789370594, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00196347308519762, + "epoch": 2.193642461358997, + "grad_norm": 0.21879757940769196, + "learning_rate": 1e-06, + "loss": -0.0383, + "step": 939 + }, + { + "clip_ratio/high_max": 0.0023015859405859374, + "clip_ratio/high_mean": 0.0008173199948942056, + "clip_ratio/low_mean": 0.001166840183941531, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019841601388179697, + "epoch": 2.195975503062117, + "grad_norm": 0.22823746502399445, + "learning_rate": 1e-06, + "loss": -0.0382, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0020483883927227, + "clip_ratio/high_mean": 0.0007574281025881646, + "clip_ratio/low_mean": 0.00047466508294746745, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012320932182774413, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3623.0, + "completions/mean_length": 1117.8504638671875, + "completions/mean_terminated_length": 630.516845703125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 2.1983085447652377, + "grad_norm": 0.30982983112335205, + "learning_rate": 1e-06, + "loss": -0.0259, + "num_tokens": 139001576.0, + "reward": 0.574776828289032, + "reward_std": 0.15090152621269226, + "rewards/verify_math_reward/mean": 0.5747767686843872, + "rewards/verify_math_reward/std": 0.49465295672416687, + "step": 941 + }, + { + "clip_ratio/high_max": 0.0023883046815171838, + "clip_ratio/high_mean": 0.0010080747797474032, + "clip_ratio/low_mean": 0.0007136938784242375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017217686836374924, + "epoch": 2.200641586468358, + "grad_norm": 0.29436051845550537, + "learning_rate": 1e-06, + "loss": -0.0261, + "step": 942 + }, + { + "clip_ratio/high_max": 0.0025798613351071253, + "clip_ratio/high_mean": 0.000991768474705168, + "clip_ratio/low_mean": 0.0008289616853289772, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001820730212784838, + "epoch": 2.2029746281714786, + "grad_norm": 0.22793766856193542, + "learning_rate": 1e-06, + "loss": -0.0262, + "step": 943 + }, + { + "clip_ratio/high_max": 0.0023700841338722967, + "clip_ratio/high_mean": 0.00098240349052503, + "clip_ratio/low_mean": 0.0010814499692060053, + "clip_ratio/low_min": 1.6587049685767852e-05, + "clip_ratio/region_mean": 0.0020638534988393076, + "epoch": 2.205307669874599, + "grad_norm": 0.26359623670578003, + "learning_rate": 1e-06, + "loss": -0.0263, + "step": 944 + }, + { + "clip_ratio/high_max": 0.001843147969339043, + "clip_ratio/high_mean": 0.0006260225245569018, + "clip_ratio/low_mean": 0.000724763362086378, + "clip_ratio/low_min": 4.9626807594904676e-05, + "clip_ratio/region_mean": 0.0013507858820958063, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2832.0, + "completions/mean_length": 1200.65185546875, + "completions/mean_terminated_length": 664.4761962890625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 2.2076407115777195, + "grad_norm": 0.33370280265808105, + "learning_rate": 1e-06, + "loss": -0.0472, + "num_tokens": 139587328.0, + "reward": 0.5334821939468384, + "reward_std": 0.15729349851608276, + "rewards/verify_math_reward/mean": 0.5334821343421936, + "rewards/verify_math_reward/std": 0.49915632605552673, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0019799328547378536, + "clip_ratio/high_mean": 0.0007350524265348213, + "clip_ratio/low_mean": 0.0010777700445032679, + "clip_ratio/low_min": 4.0787468606140465e-05, + "clip_ratio/region_mean": 0.0018128224764950573, + "epoch": 2.20997375328084, + "grad_norm": 0.22046437859535217, + "learning_rate": 1e-06, + "loss": -0.0474, + "step": 946 + }, + { + "clip_ratio/high_max": 0.002227421813586261, + "clip_ratio/high_mean": 0.0007772156586725032, + "clip_ratio/low_mean": 0.0011329547014611308, + "clip_ratio/low_min": 6.978042074479163e-05, + "clip_ratio/region_mean": 0.0019101703655906022, + "epoch": 2.2123067949839603, + "grad_norm": 0.25371313095092773, + "learning_rate": 1e-06, + "loss": -0.0475, + "step": 947 + }, + { + "clip_ratio/high_max": 0.002114128517860081, + "clip_ratio/high_mean": 0.0007653627180843614, + "clip_ratio/low_mean": 0.001256132894923212, + "clip_ratio/low_min": 5.815035183331929e-05, + "clip_ratio/region_mean": 0.002021495543885976, + "epoch": 2.214639836687081, + "grad_norm": 0.2694165110588074, + "learning_rate": 1e-06, + "loss": -0.0476, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0016557084200030658, + "clip_ratio/high_mean": 0.0005161424414836802, + "clip_ratio/low_mean": 0.000461753966192191, + "clip_ratio/low_min": 4.251655445841607e-05, + "clip_ratio/region_mean": 0.0009778964176803129, + "completions/clipped_ratio": 0.1417410714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3777.0, + "completions/mean_length": 1111.8170166015625, + "completions/mean_terminated_length": 618.98046875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 2.216972878390201, + "grad_norm": 0.276732474565506, + "learning_rate": 1e-06, + "loss": -0.041, + "num_tokens": 140145060.0, + "reward": 0.5212053656578064, + "reward_std": 0.13609540462493896, + "rewards/verify_math_reward/mean": 0.5212053656578064, + "rewards/verify_math_reward/std": 0.49982914328575134, + "step": 949 + }, + { + "clip_ratio/high_max": 0.002127289764757734, + "clip_ratio/high_mean": 0.0007239664619191899, + "clip_ratio/low_mean": 0.000746685138437897, + "clip_ratio/low_min": 4.861864908889402e-05, + "clip_ratio/region_mean": 0.001470651630370412, + "epoch": 2.219305920093322, + "grad_norm": 0.23614975810050964, + "learning_rate": 1e-06, + "loss": -0.0412, + "step": 950 + }, + { + "clip_ratio/high_max": 0.00227067967352923, + "clip_ratio/high_mean": 0.0007507983964387677, + "clip_ratio/low_mean": 0.0008739369732211344, + "clip_ratio/low_min": 4.017141691292636e-05, + "clip_ratio/region_mean": 0.0016247353378275875, + "epoch": 2.221638961796442, + "grad_norm": 0.18173855543136597, + "learning_rate": 1e-06, + "loss": -0.0414, + "step": 951 + }, + { + "clip_ratio/high_max": 0.002109660468704533, + "clip_ratio/high_mean": 0.000666342064505443, + "clip_ratio/low_mean": 0.001001883705612272, + "clip_ratio/low_min": 6.294832837738795e-05, + "clip_ratio/region_mean": 0.0016682257810316514, + "epoch": 2.2239720034995627, + "grad_norm": 0.2091158926486969, + "learning_rate": 1e-06, + "loss": -0.0414, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0026642042430466972, + "clip_ratio/high_mean": 0.0009831271636357997, + "clip_ratio/low_mean": 0.0006934364646440372, + "clip_ratio/low_min": 2.9322072805371135e-05, + "clip_ratio/region_mean": 0.0016765636173659004, + "completions/clipped_ratio": 0.1640625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3043.0, + "completions/mean_length": 1180.571533203125, + "completions/mean_terminated_length": 608.384521484375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 2.226305045202683, + "grad_norm": 0.3571631610393524, + "learning_rate": 1e-06, + "loss": -0.0407, + "num_tokens": 140691932.0, + "reward": 0.535714328289032, + "reward_std": 0.1716071218252182, + "rewards/verify_math_reward/mean": 0.5357142686843872, + "rewards/verify_math_reward/std": 0.4990014135837555, + "step": 953 + }, + { + "clip_ratio/high_max": 0.00303604166401783, + "clip_ratio/high_mean": 0.0010800326363096246, + "clip_ratio/low_mean": 0.0009275444572267588, + "clip_ratio/low_min": 6.189139821799472e-05, + "clip_ratio/region_mean": 0.0020075770444236696, + "epoch": 2.2286380869058036, + "grad_norm": 0.2970696985721588, + "learning_rate": 1e-06, + "loss": -0.0409, + "step": 954 + }, + { + "clip_ratio/high_max": 0.0035131540935253724, + "clip_ratio/high_mean": 0.0012164544277766254, + "clip_ratio/low_mean": 0.0012272803178348113, + "clip_ratio/low_min": 0.00010359116276958957, + "clip_ratio/region_mean": 0.0024437347674393095, + "epoch": 2.2309711286089238, + "grad_norm": 0.2447551190853119, + "learning_rate": 1e-06, + "loss": -0.0411, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0028052897614543326, + "clip_ratio/high_mean": 0.0010593809638521634, + "clip_ratio/low_mean": 0.0014988202528911643, + "clip_ratio/low_min": 0.0001340980379609391, + "clip_ratio/region_mean": 0.0025582012021914124, + "epoch": 2.2333041703120444, + "grad_norm": 0.2844543755054474, + "learning_rate": 1e-06, + "loss": -0.0412, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0024161854744306765, + "clip_ratio/high_mean": 0.0010049222473753616, + "clip_ratio/low_mean": 0.0004824429756808968, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014873652544338256, + "completions/clipped_ratio": 0.1294642857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3811.0, + "completions/mean_length": 1056.2723388671875, + "completions/mean_terminated_length": 604.2102661132812, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 2.2356372120151646, + "grad_norm": 0.3601495325565338, + "learning_rate": 1e-06, + "loss": -0.044, + "num_tokens": 141238360.0, + "reward": 0.6484375, + "reward_std": 0.16450665891170502, + "rewards/verify_math_reward/mean": 0.6484375, + "rewards/verify_math_reward/std": 0.4777248501777649, + "step": 957 + }, + { + "clip_ratio/high_max": 0.0032573289645370096, + "clip_ratio/high_mean": 0.0013352053574635647, + "clip_ratio/low_mean": 0.0007881219216869795, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021233272418612614, + "epoch": 2.2379702537182853, + "grad_norm": 0.2749025523662567, + "learning_rate": 1e-06, + "loss": -0.0442, + "step": 958 + }, + { + "clip_ratio/high_max": 0.003161405933497008, + "clip_ratio/high_mean": 0.0013238063256721944, + "clip_ratio/low_mean": 0.0008990651913336478, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002222871538833715, + "epoch": 2.2403032954214055, + "grad_norm": 0.24613353610038757, + "learning_rate": 1e-06, + "loss": -0.0444, + "step": 959 + }, + { + "clip_ratio/high_max": 0.002812174214341212, + "clip_ratio/high_mean": 0.0011950203588639852, + "clip_ratio/low_mean": 0.0010793403234856669, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002274360682349652, + "epoch": 2.242636337124526, + "grad_norm": 0.35043787956237793, + "learning_rate": 1e-06, + "loss": -0.0445, + "step": 960 + }, + { + "clip_ratio/high_max": 0.002419755546725355, + "clip_ratio/high_mean": 0.0008969339869508985, + "clip_ratio/low_mean": 0.00043708002590392425, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013340140030777548, + "completions/clipped_ratio": 0.1540178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2158.0, + "completions/mean_length": 1127.540283203125, + "completions/mean_terminated_length": 587.1082153320312, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 2.2449693788276464, + "grad_norm": 0.3806179463863373, + "learning_rate": 1e-06, + "loss": -0.0803, + "num_tokens": 141771868.0, + "reward": 0.5870535969734192, + "reward_std": 0.16172580420970917, + "rewards/verify_math_reward/mean": 0.5870535969734192, + "rewards/verify_math_reward/std": 0.49263834953308105, + "step": 961 + }, + { + "clip_ratio/high_max": 0.0030022057762835175, + "clip_ratio/high_mean": 0.001246008978341706, + "clip_ratio/low_mean": 0.000651918781841232, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018979277519974858, + "epoch": 2.247302420530767, + "grad_norm": 0.27406615018844604, + "learning_rate": 1e-06, + "loss": -0.0807, + "step": 962 + }, + { + "clip_ratio/high_max": 0.0028254452408873476, + "clip_ratio/high_mean": 0.0011592298033065163, + "clip_ratio/low_mean": 0.0008223621834986261, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019815919440588914, + "epoch": 2.249635462233887, + "grad_norm": 0.26929888129234314, + "learning_rate": 1e-06, + "loss": -0.0808, + "step": 963 + }, + { + "clip_ratio/high_max": 0.0026136957967537455, + "clip_ratio/high_mean": 0.0010887508469750173, + "clip_ratio/low_mean": 0.0009752080068210489, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020639588838093914, + "epoch": 2.251968503937008, + "grad_norm": 0.27124956250190735, + "learning_rate": 1e-06, + "loss": -0.0809, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0022181659478519578, + "clip_ratio/high_mean": 0.000915030407213635, + "clip_ratio/low_mean": 0.0006308629726845538, + "clip_ratio/low_min": 2.5375558834639378e-05, + "clip_ratio/region_mean": 0.0015458933776244521, + "completions/clipped_ratio": 0.1116071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2528.0, + "completions/mean_length": 1011.0848388671875, + "completions/mean_terminated_length": 623.5326538085938, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 2.2543015456401285, + "grad_norm": 0.3349124789237976, + "learning_rate": 1e-06, + "loss": -0.0567, + "num_tokens": 142348168.0, + "reward": 0.6261160969734192, + "reward_std": 0.1795709878206253, + "rewards/verify_math_reward/mean": 0.6261160969734192, + "rewards/verify_math_reward/std": 0.48410359025001526, + "step": 965 + }, + { + "clip_ratio/high_max": 0.002503495335986372, + "clip_ratio/high_mean": 0.0010731359543569852, + "clip_ratio/low_mean": 0.000809296878742316, + "clip_ratio/low_min": 9.833228432398755e-06, + "clip_ratio/region_mean": 0.0018824328071787022, + "epoch": 2.2566345873432487, + "grad_norm": 0.2904164493083954, + "learning_rate": 1e-06, + "loss": -0.0569, + "step": 966 + }, + { + "clip_ratio/high_max": 0.0024661056631885003, + "clip_ratio/high_mean": 0.0010312086633348372, + "clip_ratio/low_mean": 0.0010948240997095127, + "clip_ratio/low_min": 2.2986392650636844e-05, + "clip_ratio/region_mean": 0.0021260327084746677, + "epoch": 2.2589676290463694, + "grad_norm": 0.2515934109687805, + "learning_rate": 1e-06, + "loss": -0.0571, + "step": 967 + }, + { + "clip_ratio/high_max": 0.0027019068111258093, + "clip_ratio/high_mean": 0.00110522161321569, + "clip_ratio/low_mean": 0.0012917765252495883, + "clip_ratio/low_min": 6.47084725642344e-05, + "clip_ratio/region_mean": 0.0023969980975380167, + "epoch": 2.2613006707494896, + "grad_norm": 0.2741507887840271, + "learning_rate": 1e-06, + "loss": -0.0572, + "step": 968 + }, + { + "clip_ratio/high_max": 0.002199630605900893, + "clip_ratio/high_mean": 0.0008182023684639717, + "clip_ratio/low_mean": 0.0006519424650832661, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014701447835250292, + "completions/clipped_ratio": 0.1082589285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3819.0, + "completions/mean_length": 1000.7511596679688, + "completions/mean_terminated_length": 624.9824829101562, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 2.2636337124526102, + "grad_norm": 0.3289185166358948, + "learning_rate": 1e-06, + "loss": -0.0485, + "num_tokens": 142933185.0, + "reward": 0.598214328289032, + "reward_std": 0.16198793053627014, + "rewards/verify_math_reward/mean": 0.5982142686843872, + "rewards/verify_math_reward/std": 0.49053287506103516, + "step": 969 + }, + { + "clip_ratio/high_max": 0.0025676776858745143, + "clip_ratio/high_mean": 0.0009590076369931921, + "clip_ratio/low_mean": 0.0008741687543079024, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018331763858441263, + "epoch": 2.2659667541557305, + "grad_norm": 0.2584741711616516, + "learning_rate": 1e-06, + "loss": -0.0487, + "step": 970 + }, + { + "clip_ratio/high_max": 0.002799642810714431, + "clip_ratio/high_mean": 0.0010030778466898482, + "clip_ratio/low_mean": 0.0010420067264931276, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020450845404411666, + "epoch": 2.268299795858851, + "grad_norm": 0.23060286045074463, + "learning_rate": 1e-06, + "loss": -0.0489, + "step": 971 + }, + { + "clip_ratio/high_max": 0.002635231299791485, + "clip_ratio/high_mean": 0.0009523928201815579, + "clip_ratio/low_mean": 0.001234156496138894, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021865493181394413, + "epoch": 2.2706328375619713, + "grad_norm": 0.23002904653549194, + "learning_rate": 1e-06, + "loss": -0.049, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0028391991509124637, + "clip_ratio/high_mean": 0.001106439893192146, + "clip_ratio/low_mean": 0.0006090573533583665, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017154972338175867, + "completions/clipped_ratio": 0.1439732142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2396.0, + "completions/mean_length": 1066.6551513671875, + "completions/mean_terminated_length": 557.1564331054688, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 2.272965879265092, + "grad_norm": 0.3089137673377991, + "learning_rate": 1e-06, + "loss": -0.0821, + "num_tokens": 143437588.0, + "reward": 0.645089328289032, + "reward_std": 0.15808121860027313, + "rewards/verify_math_reward/mean": 0.6450892686843872, + "rewards/verify_math_reward/std": 0.4787535071372986, + "step": 973 + }, + { + "clip_ratio/high_max": 0.0032532387995161116, + "clip_ratio/high_mean": 0.0012893593338958453, + "clip_ratio/low_mean": 0.0007900426817286643, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020794020492758136, + "epoch": 2.275298920968212, + "grad_norm": 0.2660689949989319, + "learning_rate": 1e-06, + "loss": -0.0822, + "step": 974 + }, + { + "clip_ratio/high_max": 0.0037110813573235646, + "clip_ratio/high_mean": 0.0013476625608745962, + "clip_ratio/low_mean": 0.0009197242743539391, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00226738685159944, + "epoch": 2.277631962671333, + "grad_norm": 0.2527943253517151, + "learning_rate": 1e-06, + "loss": -0.0825, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0033753456664271653, + "clip_ratio/high_mean": 0.001257763993635308, + "clip_ratio/low_mean": 0.0011084291199949803, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002366193075431511, + "epoch": 2.279965004374453, + "grad_norm": 0.2624468505382538, + "learning_rate": 1e-06, + "loss": -0.0825, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0023874520557001233, + "clip_ratio/high_mean": 0.0010449714463902637, + "clip_ratio/low_mean": 0.0005359301885619061, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015809016913408414, + "completions/clipped_ratio": 0.1674107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3252.0, + "completions/mean_length": 1222.703125, + "completions/mean_terminated_length": 644.9624633789062, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 2.2822980460775737, + "grad_norm": 0.36057326197624207, + "learning_rate": 1e-06, + "loss": -0.0536, + "num_tokens": 144001986.0, + "reward": 0.578125, + "reward_std": 0.17299975454807281, + "rewards/verify_math_reward/mean": 0.578125, + "rewards/verify_math_reward/std": 0.4941346049308777, + "step": 977 + }, + { + "clip_ratio/high_max": 0.0027978644429822452, + "clip_ratio/high_mean": 0.0011361790857336018, + "clip_ratio/low_mean": 0.0007619837706442922, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018981628454639576, + "epoch": 2.284631087780694, + "grad_norm": 0.2731843888759613, + "learning_rate": 1e-06, + "loss": -0.0537, + "step": 978 + }, + { + "clip_ratio/high_max": 0.002700283053854946, + "clip_ratio/high_mean": 0.0011772612706408836, + "clip_ratio/low_mean": 0.000963448102993425, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002140709380910266, + "epoch": 2.2869641294838146, + "grad_norm": 0.2100088894367218, + "learning_rate": 1e-06, + "loss": -0.0539, + "step": 979 + }, + { + "clip_ratio/high_max": 0.00309593380370643, + "clip_ratio/high_mean": 0.0012225247410242446, + "clip_ratio/low_mean": 0.001080663063476095, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002303187829966191, + "epoch": 2.289297171186935, + "grad_norm": 0.21596242487430573, + "learning_rate": 1e-06, + "loss": -0.0539, + "step": 980 + }, + { + "clip_ratio/high_max": 0.002694579445233103, + "clip_ratio/high_mean": 0.0012348649215709884, + "clip_ratio/low_mean": 0.0006861911151645472, + "clip_ratio/low_min": 4.292476296541281e-05, + "clip_ratio/region_mean": 0.0019210560130886734, + "completions/clipped_ratio": 0.1595982142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4012.0, + "completions/mean_length": 1203.9710693359375, + "completions/mean_terminated_length": 654.7543334960938, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 2.2916302128900554, + "grad_norm": 0.28912052512168884, + "learning_rate": 1e-06, + "loss": -0.0814, + "num_tokens": 144588008.0, + "reward": 0.5167410969734192, + "reward_std": 0.20376215875148773, + "rewards/verify_math_reward/mean": 0.5167410969734192, + "rewards/verify_math_reward/std": 0.4999987483024597, + "step": 981 + }, + { + "clip_ratio/high_max": 0.002998442665557377, + "clip_ratio/high_mean": 0.0013558516875491478, + "clip_ratio/low_mean": 0.0009762507397681475, + "clip_ratio/low_min": 6.391544957295991e-05, + "clip_ratio/region_mean": 0.0023321025000768714, + "epoch": 2.2939632545931756, + "grad_norm": 0.2508056163787842, + "learning_rate": 1e-06, + "loss": -0.0816, + "step": 982 + }, + { + "clip_ratio/high_max": 0.0032239833162748255, + "clip_ratio/high_mean": 0.001394302129483549, + "clip_ratio/low_mean": 0.0011156422642670805, + "clip_ratio/low_min": 7.90116610005498e-05, + "clip_ratio/region_mean": 0.002509944373741746, + "epoch": 2.2962962962962963, + "grad_norm": 0.2622165083885193, + "learning_rate": 1e-06, + "loss": -0.0818, + "step": 983 + }, + { + "clip_ratio/high_max": 0.0029617280670208856, + "clip_ratio/high_mean": 0.0013805006765323924, + "clip_ratio/low_mean": 0.0013676582530024461, + "clip_ratio/low_min": 9.943220175046008e-05, + "clip_ratio/region_mean": 0.0027481589568196796, + "epoch": 2.298629337999417, + "grad_norm": 0.2608332335948944, + "learning_rate": 1e-06, + "loss": -0.0819, + "step": 984 + }, + { + "clip_ratio/high_max": 0.002652029063028749, + "clip_ratio/high_mean": 0.0007925968584459042, + "clip_ratio/low_mean": 0.0007248047786561074, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015174016079981811, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2870.0, + "completions/mean_length": 1135.5882568359375, + "completions/mean_terminated_length": 651.1571044921875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 2.300962379702537, + "grad_norm": 0.3000766336917877, + "learning_rate": 1e-06, + "loss": -0.0537, + "num_tokens": 145185087.0, + "reward": 0.559151828289032, + "reward_std": 0.1624336987733841, + "rewards/verify_math_reward/mean": 0.5591517686843872, + "rewards/verify_math_reward/std": 0.496766060590744, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0030103262324701063, + "clip_ratio/high_mean": 0.0010665687896107556, + "clip_ratio/low_mean": 0.0009453985421714606, + "clip_ratio/low_min": 4.060199353261851e-05, + "clip_ratio/region_mean": 0.002011967313592322, + "epoch": 2.303295421405658, + "grad_norm": 0.2218482494354248, + "learning_rate": 1e-06, + "loss": -0.054, + "step": 986 + }, + { + "clip_ratio/high_max": 0.002894998098781798, + "clip_ratio/high_mean": 0.0009617551313567674, + "clip_ratio/low_mean": 0.0010639281244948506, + "clip_ratio/low_min": 1.2250097825017292e-05, + "clip_ratio/region_mean": 0.002025683337706141, + "epoch": 2.305628463108778, + "grad_norm": 0.22861462831497192, + "learning_rate": 1e-06, + "loss": -0.0541, + "step": 987 + }, + { + "clip_ratio/high_max": 0.0030679903575219214, + "clip_ratio/high_mean": 0.0010229348026769003, + "clip_ratio/low_mean": 0.0012137892845203169, + "clip_ratio/low_min": 1.2250097825017292e-05, + "clip_ratio/region_mean": 0.002236724059912376, + "epoch": 2.3079615048118987, + "grad_norm": 0.2555590569972992, + "learning_rate": 1e-06, + "loss": -0.0542, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0031955152153386734, + "clip_ratio/high_mean": 0.0011687080059346044, + "clip_ratio/low_mean": 0.0006758173572052328, + "clip_ratio/low_min": 2.572965604485944e-05, + "clip_ratio/region_mean": 0.0018445253372192383, + "completions/clipped_ratio": 0.1662946428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2483.0, + "completions/mean_length": 1158.669677734375, + "completions/mean_terminated_length": 572.7764282226562, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 2.310294546515019, + "grad_norm": 0.3772180378437042, + "learning_rate": 1e-06, + "loss": -0.066, + "num_tokens": 145703951.0, + "reward": 0.5558035969734192, + "reward_std": 0.1875448375940323, + "rewards/verify_math_reward/mean": 0.5558035969734192, + "rewards/verify_math_reward/std": 0.49715372920036316, + "step": 989 + }, + { + "clip_ratio/high_max": 0.0033459850965300575, + "clip_ratio/high_mean": 0.001259680666407803, + "clip_ratio/low_mean": 0.0010045460803667083, + "clip_ratio/low_min": 3.2691279557184316e-05, + "clip_ratio/region_mean": 0.0022642267504124902, + "epoch": 2.3126275882181395, + "grad_norm": 0.29148250818252563, + "learning_rate": 1e-06, + "loss": -0.0664, + "step": 990 + }, + { + "clip_ratio/high_max": 0.003272478556027636, + "clip_ratio/high_mean": 0.0013310912727320101, + "clip_ratio/low_mean": 0.001179783198494988, + "clip_ratio/low_min": 2.8312570066191256e-05, + "clip_ratio/region_mean": 0.0025108745030593127, + "epoch": 2.3149606299212597, + "grad_norm": 0.3249438405036926, + "learning_rate": 1e-06, + "loss": -0.0665, + "step": 991 + }, + { + "clip_ratio/high_max": 0.0033494883828097954, + "clip_ratio/high_mean": 0.0013296841680130456, + "clip_ratio/low_mean": 0.0014279158840508899, + "clip_ratio/low_min": 8.391755181946792e-05, + "clip_ratio/region_mean": 0.0027576000647968613, + "epoch": 2.3172936716243804, + "grad_norm": 0.2720489203929901, + "learning_rate": 1e-06, + "loss": -0.0667, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0027341580935171805, + "clip_ratio/high_mean": 0.0010409573660581373, + "clip_ratio/low_mean": 0.000581146584408998, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001622103933186736, + "completions/clipped_ratio": 0.1529017857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3695.0, + "completions/mean_length": 1190.938720703125, + "completions/mean_terminated_length": 666.5731201171875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 2.3196267133275006, + "grad_norm": 0.39617666602134705, + "learning_rate": 1e-06, + "loss": -0.0477, + "num_tokens": 146300536.0, + "reward": 0.5390625, + "reward_std": 0.16044628620147705, + "rewards/verify_math_reward/mean": 0.5390625, + "rewards/verify_math_reward/std": 0.4987502098083496, + "step": 993 + }, + { + "clip_ratio/high_max": 0.0033796231291489676, + "clip_ratio/high_mean": 0.001176894274976803, + "clip_ratio/low_mean": 0.0008453242789983051, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020222185557940975, + "epoch": 2.3219597550306212, + "grad_norm": 0.31302952766418457, + "learning_rate": 1e-06, + "loss": -0.0478, + "step": 994 + }, + { + "clip_ratio/high_max": 0.0033353502658428624, + "clip_ratio/high_mean": 0.0012012331644655205, + "clip_ratio/low_mean": 0.0009508791226835456, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021521122835110873, + "epoch": 2.3242927967337415, + "grad_norm": 0.2497008889913559, + "learning_rate": 1e-06, + "loss": -0.0481, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0029132952477084473, + "clip_ratio/high_mean": 0.0010657403581717517, + "clip_ratio/low_mean": 0.0011637679217528785, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002229508289019577, + "epoch": 2.326625838436862, + "grad_norm": 0.224105566740036, + "learning_rate": 1e-06, + "loss": -0.0482, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0020946715885656886, + "clip_ratio/high_mean": 0.0008001404403330525, + "clip_ratio/low_mean": 0.00042710497018561, + "clip_ratio/low_min": 1.6129031791933812e-05, + "clip_ratio/region_mean": 0.0012272453859623056, + "completions/clipped_ratio": 0.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3185.0, + "completions/mean_length": 1075.9140625, + "completions/mean_terminated_length": 644.4732055664062, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 2.3289588801399823, + "grad_norm": 0.2584588825702667, + "learning_rate": 1e-06, + "loss": -0.0593, + "num_tokens": 146888867.0, + "reward": 0.6071428656578064, + "reward_std": 0.15706203877925873, + "rewards/verify_math_reward/mean": 0.6071428656578064, + "rewards/verify_math_reward/std": 0.48865827918052673, + "step": 997 + }, + { + "clip_ratio/high_max": 0.002415469105471857, + "clip_ratio/high_mean": 0.0009024233272612037, + "clip_ratio/low_mean": 0.0006431267738662427, + "clip_ratio/low_min": 5.172770397621207e-05, + "clip_ratio/region_mean": 0.0015455501197720878, + "epoch": 2.331291921843103, + "grad_norm": 0.29324856400489807, + "learning_rate": 1e-06, + "loss": -0.0595, + "step": 998 + }, + { + "clip_ratio/high_max": 0.0024191764314309694, + "clip_ratio/high_mean": 0.0009480474327574484, + "clip_ratio/low_mean": 0.0008097914069367107, + "clip_ratio/low_min": 1.6129031791933812e-05, + "clip_ratio/region_mean": 0.0017578388178662863, + "epoch": 2.3336249635462236, + "grad_norm": 0.20104141533374786, + "learning_rate": 1e-06, + "loss": -0.0596, + "step": 999 + }, + { + "clip_ratio/high_max": 0.0022805830703873653, + "clip_ratio/high_mean": 0.0008394501201109961, + "clip_ratio/low_mean": 0.0008450898640148807, + "clip_ratio/low_min": 6.451612716773525e-05, + "clip_ratio/region_mean": 0.0016845399732119404, + "epoch": 2.335958005249344, + "grad_norm": 0.20965102314949036, + "learning_rate": 1e-06, + "loss": -0.0597, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.002165381643862929, + "clip_ratio/high_mean": 0.000781822514909436, + "clip_ratio/low_mean": 0.0005456448689074023, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013274673838168383, + "completions/clipped_ratio": 0.1573660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3491.0, + "completions/mean_length": 1153.9107666015625, + "completions/mean_terminated_length": 604.4609375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 2.338291046952464, + "grad_norm": 0.3027136027812958, + "learning_rate": 1e-06, + "loss": -0.0542, + "num_tokens": 147437075.0, + "reward": 0.5334821939468384, + "reward_std": 0.13500885665416718, + "rewards/verify_math_reward/mean": 0.5334821343421936, + "rewards/verify_math_reward/std": 0.49915632605552673, + "step": 1001 + }, + { + "clip_ratio/high_max": 0.0026408979974803515, + "clip_ratio/high_mean": 0.0009627417293813778, + "clip_ratio/low_mean": 0.0007521888182964176, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017149305313068908, + "epoch": 2.3406240886555847, + "grad_norm": 0.25148019194602966, + "learning_rate": 1e-06, + "loss": -0.0543, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.0027020912239095196, + "clip_ratio/high_mean": 0.0009959432627510978, + "clip_ratio/low_mean": 0.0009145451731455978, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019104883976979181, + "epoch": 2.3429571303587053, + "grad_norm": 0.20745614171028137, + "learning_rate": 1e-06, + "loss": -0.0545, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.002835997562215198, + "clip_ratio/high_mean": 0.000995350506855175, + "clip_ratio/low_mean": 0.0011339418560964987, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002129292370227631, + "epoch": 2.3452901720618256, + "grad_norm": 0.23285433650016785, + "learning_rate": 1e-06, + "loss": -0.0546, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0025864409108180553, + "clip_ratio/high_mean": 0.0009867948083410738, + "clip_ratio/low_mean": 0.0006698022280033911, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016565970581723377, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2938.0, + "completions/mean_length": 1144.094970703125, + "completions/mean_terminated_length": 597.4457397460938, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 2.347623213764946, + "grad_norm": 0.32017406821250916, + "learning_rate": 1e-06, + "loss": -0.0705, + "num_tokens": 147974072.0, + "reward": 0.5691964626312256, + "reward_std": 0.17277081310749054, + "rewards/verify_math_reward/mean": 0.5691964030265808, + "rewards/verify_math_reward/std": 0.4954652488231659, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.003541181984473951, + "clip_ratio/high_mean": 0.0012657882925850572, + "clip_ratio/low_mean": 0.0010236859016004018, + "clip_ratio/low_min": 2.3710166715318337e-05, + "clip_ratio/region_mean": 0.0022894742214703, + "epoch": 2.3499562554680664, + "grad_norm": 0.2364875078201294, + "learning_rate": 1e-06, + "loss": -0.0709, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.0031734051372040994, + "clip_ratio/high_mean": 0.0012652313234866597, + "clip_ratio/low_mean": 0.0011457300988695351, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002410961387795396, + "epoch": 2.352289297171187, + "grad_norm": 0.245115265250206, + "learning_rate": 1e-06, + "loss": -0.0709, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.0032782024645712227, + "clip_ratio/high_mean": 0.0012094823105144314, + "clip_ratio/low_mean": 0.0013976999325677752, + "clip_ratio/low_min": 2.3710166715318337e-05, + "clip_ratio/region_mean": 0.002607182163046673, + "epoch": 2.3546223388743073, + "grad_norm": 0.21835707128047943, + "learning_rate": 1e-06, + "loss": -0.0711, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0016837205039337277, + "clip_ratio/high_mean": 0.0006111365655669942, + "clip_ratio/low_mean": 0.00039568928423250327, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010068258379760664, + "completions/clipped_ratio": 0.1450892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2747.0, + "completions/mean_length": 1125.196533203125, + "completions/mean_terminated_length": 621.0130615234375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 2.356955380577428, + "grad_norm": 0.2512587904930115, + "learning_rate": 1e-06, + "loss": -0.0546, + "num_tokens": 148534904.0, + "reward": 0.629464328289032, + "reward_std": 0.13534656167030334, + "rewards/verify_math_reward/mean": 0.6294642686843872, + "rewards/verify_math_reward/std": 0.4832179844379425, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.002148231546016177, + "clip_ratio/high_mean": 0.0008400733331654919, + "clip_ratio/low_mean": 0.0005824126833431365, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014224859842215665, + "epoch": 2.359288422280548, + "grad_norm": 0.18119673430919647, + "learning_rate": 1e-06, + "loss": -0.0548, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0024765461348579265, + "clip_ratio/high_mean": 0.0009033664991875412, + "clip_ratio/low_mean": 0.0006529972429234476, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015563637716695666, + "epoch": 2.361621463983669, + "grad_norm": 0.18178009986877441, + "learning_rate": 1e-06, + "loss": -0.0549, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.0024374165113840718, + "clip_ratio/high_mean": 0.000785118581916322, + "clip_ratio/low_mean": 0.0007713841982877057, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015565027970296796, + "epoch": 2.363954505686789, + "grad_norm": 0.16140949726104736, + "learning_rate": 1e-06, + "loss": -0.055, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.001982963345653843, + "clip_ratio/high_mean": 0.0006908731138537405, + "clip_ratio/low_mean": 0.0006206093303262605, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013114824396325275, + "completions/clipped_ratio": 0.1450892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2940.0, + "completions/mean_length": 1175.548095703125, + "completions/mean_terminated_length": 679.909912109375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 2.3662875473899097, + "grad_norm": 0.28620150685310364, + "learning_rate": 1e-06, + "loss": -0.0503, + "num_tokens": 149138171.0, + "reward": 0.5613839626312256, + "reward_std": 0.13654935359954834, + "rewards/verify_math_reward/mean": 0.5613839030265808, + "rewards/verify_math_reward/std": 0.496494859457016, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.0024065280231297947, + "clip_ratio/high_mean": 0.0007314348376894486, + "clip_ratio/low_mean": 0.000729183364455821, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014606182121497113, + "epoch": 2.36862058909303, + "grad_norm": 0.20217272639274597, + "learning_rate": 1e-06, + "loss": -0.0505, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.002407199404842686, + "clip_ratio/high_mean": 0.0007553441428171936, + "clip_ratio/low_mean": 0.0008421047932642978, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015974489360814914, + "epoch": 2.3709536307961505, + "grad_norm": 0.22331973910331726, + "learning_rate": 1e-06, + "loss": -0.0505, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0020915623936161865, + "clip_ratio/high_mean": 0.0007299595235963352, + "clip_ratio/low_mean": 0.0010612890218908433, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017912485091073904, + "epoch": 2.3732866724992707, + "grad_norm": 0.20704081654548645, + "learning_rate": 1e-06, + "loss": -0.0506, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0034728839236777276, + "clip_ratio/high_mean": 0.0011303817245789105, + "clip_ratio/low_mean": 0.0005705017920263344, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017008835238812026, + "completions/clipped_ratio": 0.1238839285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3766.0, + "completions/mean_length": 1060.7645263671875, + "completions/mean_terminated_length": 631.578369140625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 2.3756197142023914, + "grad_norm": 0.36774662137031555, + "learning_rate": 1e-06, + "loss": -0.057, + "num_tokens": 149718992.0, + "reward": 0.6171875, + "reward_std": 0.1595052033662796, + "rewards/verify_math_reward/mean": 0.6171875, + "rewards/verify_math_reward/std": 0.4863446056842804, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.0044531360326800495, + "clip_ratio/high_mean": 0.0013801455934299156, + "clip_ratio/low_mean": 0.000763638318858284, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002143783975043334, + "epoch": 2.377952755905512, + "grad_norm": 0.28642094135284424, + "learning_rate": 1e-06, + "loss": -0.0574, + "step": 1018 + }, + { + "clip_ratio/high_max": 0.005310886037477758, + "clip_ratio/high_mean": 0.0015749102640256751, + "clip_ratio/low_mean": 0.0009763622192622279, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0025512724605505355, + "epoch": 2.3802857976086322, + "grad_norm": 0.30652979016304016, + "learning_rate": 1e-06, + "loss": -0.0576, + "step": 1019 + }, + { + "clip_ratio/high_max": 0.005440009721496608, + "clip_ratio/high_mean": 0.001479281030697166, + "clip_ratio/low_mean": 0.0011621802077570464, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026414612657390535, + "epoch": 2.382618839311753, + "grad_norm": 0.2900792062282562, + "learning_rate": 1e-06, + "loss": -0.0576, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0016350403457181528, + "clip_ratio/high_mean": 0.0006149514538265066, + "clip_ratio/low_mean": 0.00037639642846443166, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.000991347873423365, + "completions/clipped_ratio": 0.1227678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3706.0, + "completions/mean_length": 1115.727783203125, + "completions/mean_terminated_length": 698.6412353515625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 2.384951881014873, + "grad_norm": 0.22175031900405884, + "learning_rate": 1e-06, + "loss": -0.0308, + "num_tokens": 150361820.0, + "reward": 0.5491071939468384, + "reward_std": 0.13354600965976715, + "rewards/verify_math_reward/mean": 0.5491071343421936, + "rewards/verify_math_reward/std": 0.49786055088043213, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.001988006792089436, + "clip_ratio/high_mean": 0.000717979422006465, + "clip_ratio/low_mean": 0.0005121810263517546, + "clip_ratio/low_min": 1.3724198652198538e-05, + "clip_ratio/region_mean": 0.0012301604547246825, + "epoch": 2.3872849227179938, + "grad_norm": 0.21414272487163544, + "learning_rate": 1e-06, + "loss": -0.0309, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.0021304474903445225, + "clip_ratio/high_mean": 0.0008804438002698589, + "clip_ratio/low_mean": 0.000570450026771141, + "clip_ratio/low_min": 1.941295158758294e-05, + "clip_ratio/region_mean": 0.001450893840228673, + "epoch": 2.389617964421114, + "grad_norm": 0.22635546326637268, + "learning_rate": 1e-06, + "loss": -0.0311, + "step": 1023 + }, + { + "clip_ratio/high_max": 0.0018546131504990626, + "clip_ratio/high_mean": 0.000691392498993082, + "clip_ratio/low_mean": 0.000757499012252083, + "clip_ratio/low_min": 2.911942829086911e-05, + "clip_ratio/region_mean": 0.0014488915076071862, + "epoch": 2.3919510061242346, + "grad_norm": 0.17617785930633545, + "learning_rate": 1e-06, + "loss": -0.0311, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.002508030127501115, + "clip_ratio/high_mean": 0.0010209910560661228, + "clip_ratio/low_mean": 0.0005794368807983119, + "clip_ratio/low_min": 1.5337423974415287e-05, + "clip_ratio/region_mean": 0.0016004279714252334, + "completions/clipped_ratio": 0.1339285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4045.0, + "completions/mean_length": 1116.724365234375, + "completions/mean_terminated_length": 656.0115966796875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 2.394284047827355, + "grad_norm": 0.2999490201473236, + "learning_rate": 1e-06, + "loss": -0.055, + "num_tokens": 150959397.0, + "reward": 0.578125, + "reward_std": 0.17739249765872955, + "rewards/verify_math_reward/mean": 0.578125, + "rewards/verify_math_reward/std": 0.4941346049308777, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.002997482515638694, + "clip_ratio/high_mean": 0.0012299517911742441, + "clip_ratio/low_mean": 0.0007308456479222514, + "clip_ratio/low_min": 1.5337423974415287e-05, + "clip_ratio/region_mean": 0.001960797468200326, + "epoch": 2.3966170895304755, + "grad_norm": 0.2786708176136017, + "learning_rate": 1e-06, + "loss": -0.0553, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.003061028866795823, + "clip_ratio/high_mean": 0.0012740095735352952, + "clip_ratio/low_mean": 0.0009666674104664708, + "clip_ratio/low_min": 1.5337423974415287e-05, + "clip_ratio/region_mean": 0.002240676956716925, + "epoch": 2.3989501312335957, + "grad_norm": 0.30744946002960205, + "learning_rate": 1e-06, + "loss": -0.0554, + "step": 1027 + }, + { + "clip_ratio/high_max": 0.0029087264701956883, + "clip_ratio/high_mean": 0.0011492716112115886, + "clip_ratio/low_mean": 0.0011242707532801433, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022735423772246577, + "epoch": 2.4012831729367163, + "grad_norm": 0.19937247037887573, + "learning_rate": 1e-06, + "loss": -0.0555, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0026239179860567674, + "clip_ratio/high_mean": 0.0011437333923822735, + "clip_ratio/low_mean": 0.0006050125793990446, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017487459990661591, + "completions/clipped_ratio": 0.1205357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3868.0, + "completions/mean_length": 1022.3281860351562, + "completions/mean_terminated_length": 601.0634155273438, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 2.4036162146398365, + "grad_norm": 0.3392671048641205, + "learning_rate": 1e-06, + "loss": -0.0527, + "num_tokens": 151517939.0, + "reward": 0.625, + "reward_std": 0.1799846738576889, + "rewards/verify_math_reward/mean": 0.625, + "rewards/verify_math_reward/std": 0.48439329862594604, + "step": 1029 + }, + { + "clip_ratio/high_max": 0.002875018493796233, + "clip_ratio/high_mean": 0.0012252770939085167, + "clip_ratio/low_mean": 0.0008593679367550067, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002084644991555251, + "epoch": 2.405949256342957, + "grad_norm": 0.3104827404022217, + "learning_rate": 1e-06, + "loss": -0.053, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0031917464584694244, + "clip_ratio/high_mean": 0.001287612994929077, + "clip_ratio/low_mean": 0.0010495392143639037, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023371522111119702, + "epoch": 2.4082822980460774, + "grad_norm": 0.29952681064605713, + "learning_rate": 1e-06, + "loss": -0.0532, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.0029809527841280214, + "clip_ratio/high_mean": 0.0011874449846800417, + "clip_ratio/low_mean": 0.0012573567855724832, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024448017647955567, + "epoch": 2.410615339749198, + "grad_norm": 0.24343644082546234, + "learning_rate": 1e-06, + "loss": -0.0533, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.002099279656249564, + "clip_ratio/high_mean": 0.0008029598102439195, + "clip_ratio/low_mean": 0.000499091003803187, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013020508013141807, + "completions/clipped_ratio": 0.1149553571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3196.0, + "completions/mean_length": 1026.536865234375, + "completions/mean_terminated_length": 627.85498046875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 2.4129483814523183, + "grad_norm": 0.28373295068740845, + "learning_rate": 1e-06, + "loss": -0.0644, + "num_tokens": 152107364.0, + "reward": 0.6160714626312256, + "reward_std": 0.16743192076683044, + "rewards/verify_math_reward/mean": 0.6160714030265808, + "rewards/verify_math_reward/std": 0.486612468957901, + "step": 1033 + }, + { + "clip_ratio/high_max": 0.0028957660470041446, + "clip_ratio/high_mean": 0.0010994240874424577, + "clip_ratio/low_mean": 0.000717490160241141, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018169142858823761, + "epoch": 2.415281423155439, + "grad_norm": 0.22463740408420563, + "learning_rate": 1e-06, + "loss": -0.0646, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.0024197215352614876, + "clip_ratio/high_mean": 0.0009790176372916903, + "clip_ratio/low_mean": 0.0008822561085253255, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00186127373308409, + "epoch": 2.417614464858559, + "grad_norm": 0.24838964641094208, + "learning_rate": 1e-06, + "loss": -0.0648, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.002677409182069823, + "clip_ratio/high_mean": 0.0010236194138997234, + "clip_ratio/low_mean": 0.0010017153799708467, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002025334812060464, + "epoch": 2.41994750656168, + "grad_norm": 0.21755583584308624, + "learning_rate": 1e-06, + "loss": -0.0648, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0024117432403727435, + "clip_ratio/high_mean": 0.0010259137743560132, + "clip_ratio/low_mean": 0.0008399565867875936, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018658704066183418, + "completions/clipped_ratio": 0.1685267857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3989.0, + "completions/mean_length": 1220.51904296875, + "completions/mean_terminated_length": 637.703369140625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 2.4222805482648004, + "grad_norm": 0.39747315645217896, + "learning_rate": 1e-06, + "loss": -0.0534, + "num_tokens": 152674085.0, + "reward": 0.5100446939468384, + "reward_std": 0.2037724405527115, + "rewards/verify_math_reward/mean": 0.5100446343421936, + "rewards/verify_math_reward/std": 0.5001782774925232, + "step": 1037 + }, + { + "clip_ratio/high_max": 0.0029730666137766093, + "clip_ratio/high_mean": 0.00125672075591865, + "clip_ratio/low_mean": 0.0010882800452236552, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00234500078659039, + "epoch": 2.4246135899679206, + "grad_norm": 0.45811083912849426, + "learning_rate": 1e-06, + "loss": -0.0538, + "step": 1038 + }, + { + "clip_ratio/high_max": 0.0027848283207276836, + "clip_ratio/high_mean": 0.0011850921364384703, + "clip_ratio/low_mean": 0.0014201014419086277, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026051936220028438, + "epoch": 2.4269466316710413, + "grad_norm": 0.25683873891830444, + "learning_rate": 1e-06, + "loss": -0.0539, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.0030117457718006335, + "clip_ratio/high_mean": 0.001205594224302331, + "clip_ratio/low_mean": 0.001628739686566405, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0028343339363345876, + "epoch": 2.4292796733741615, + "grad_norm": 0.2934616208076477, + "learning_rate": 1e-06, + "loss": -0.0541, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0023620007559657097, + "clip_ratio/high_mean": 0.0009517699872958474, + "clip_ratio/low_mean": 0.0006037206144355878, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015554906567558646, + "completions/clipped_ratio": 0.1395089285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2973.0, + "completions/mean_length": 1104.765625, + "completions/mean_terminated_length": 619.805419921875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 2.431612715077282, + "grad_norm": 0.32303157448768616, + "learning_rate": 1e-06, + "loss": -0.0482, + "num_tokens": 153247235.0, + "reward": 0.5524553656578064, + "reward_std": 0.17922443151474, + "rewards/verify_math_reward/mean": 0.5524553656578064, + "rewards/verify_math_reward/std": 0.49751853942871094, + "step": 1041 + }, + { + "clip_ratio/high_max": 0.0029223363890196197, + "clip_ratio/high_mean": 0.0011394780394766713, + "clip_ratio/low_mean": 0.0009088044116651872, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020482824838836677, + "epoch": 2.4339457567804024, + "grad_norm": 0.22415462136268616, + "learning_rate": 1e-06, + "loss": -0.0485, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.002557893552875612, + "clip_ratio/high_mean": 0.0010014909148594597, + "clip_ratio/low_mean": 0.0011282869018032216, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002129777829395607, + "epoch": 2.436278798483523, + "grad_norm": 0.24331408739089966, + "learning_rate": 1e-06, + "loss": -0.0486, + "step": 1043 + }, + { + "clip_ratio/high_max": 0.0026559678371995687, + "clip_ratio/high_mean": 0.0010543445932853501, + "clip_ratio/low_mean": 0.0012289675414649537, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002283312140207272, + "epoch": 2.4386118401866432, + "grad_norm": 0.24428428709506989, + "learning_rate": 1e-06, + "loss": -0.0487, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0021358276244427543, + "clip_ratio/high_mean": 0.000681489653288736, + "clip_ratio/low_mean": 0.0005759707082688692, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001257460353372153, + "completions/clipped_ratio": 0.1584821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2308.0, + "completions/mean_length": 1167.421875, + "completions/mean_terminated_length": 615.8859252929688, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "epoch": 2.440944881889764, + "grad_norm": 0.34599801898002625, + "learning_rate": 1e-06, + "loss": -0.0598, + "num_tokens": 153800453.0, + "reward": 0.5569196939468384, + "reward_std": 0.1539819985628128, + "rewards/verify_math_reward/mean": 0.5569196343421936, + "rewards/verify_math_reward/std": 0.4970270097255707, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0024830914499034407, + "clip_ratio/high_mean": 0.0008351252122338337, + "clip_ratio/low_mean": 0.0009217281767632812, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017568534167367034, + "epoch": 2.443277923592884, + "grad_norm": 0.23306939005851746, + "learning_rate": 1e-06, + "loss": -0.0601, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.002836348914570408, + "clip_ratio/high_mean": 0.0009229661163772107, + "clip_ratio/low_mean": 0.0010264891225233441, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019494552543619648, + "epoch": 2.4456109652960047, + "grad_norm": 0.2903509736061096, + "learning_rate": 1e-06, + "loss": -0.0603, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.0025230854234905564, + "clip_ratio/high_mean": 0.0007946755479224521, + "clip_ratio/low_mean": 0.0012072050776623655, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020018806608277373, + "epoch": 2.447944006999125, + "grad_norm": 0.23268888890743256, + "learning_rate": 1e-06, + "loss": -0.0604, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0020995121230953373, + "clip_ratio/high_mean": 0.0006992659818934044, + "clip_ratio/low_mean": 0.0008514704632034409, + "clip_ratio/low_min": 1.7846945411292836e-05, + "clip_ratio/region_mean": 0.001550736455101287, + "completions/clipped_ratio": 0.1316964285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3066.0, + "completions/mean_length": 1065.997802734375, + "completions/mean_terminated_length": 606.4344482421875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 2.4502770487022456, + "grad_norm": 0.3484068810939789, + "learning_rate": 1e-06, + "loss": -0.0566, + "num_tokens": 154357067.0, + "reward": 0.5647321939468384, + "reward_std": 0.14943771064281464, + "rewards/verify_math_reward/mean": 0.5647321343421936, + "rewards/verify_math_reward/std": 0.49606895446777344, + "step": 1049 + }, + { + "clip_ratio/high_max": 0.0028557084588101134, + "clip_ratio/high_mean": 0.0009366729300381849, + "clip_ratio/low_mean": 0.0011126888975923066, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002049361872195732, + "epoch": 2.452610090405366, + "grad_norm": 0.37618282437324524, + "learning_rate": 1e-06, + "loss": -0.0569, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0027996127391816117, + "clip_ratio/high_mean": 0.0009656961301516276, + "clip_ratio/low_mean": 0.001358412255285657, + "clip_ratio/low_min": 1.5879064449109137e-05, + "clip_ratio/region_mean": 0.002324108405446168, + "epoch": 2.4549431321084865, + "grad_norm": 0.2839452922344208, + "learning_rate": 1e-06, + "loss": -0.057, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.0028057046947651543, + "clip_ratio/high_mean": 0.0009376704547321424, + "clip_ratio/low_mean": 0.001566952392749954, + "clip_ratio/low_min": 1.984126902243588e-05, + "clip_ratio/region_mean": 0.0025046228474820964, + "epoch": 2.457276173811607, + "grad_norm": 0.2504903972148895, + "learning_rate": 1e-06, + "loss": -0.0571, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0025813447864493355, + "clip_ratio/high_mean": 0.0010606997839204269, + "clip_ratio/low_mean": 0.0006247893743420718, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016854891619004775, + "completions/clipped_ratio": 0.1417410714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3553.0, + "completions/mean_length": 1123.673095703125, + "completions/mean_terminated_length": 632.7945556640625, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 2.4596092155147273, + "grad_norm": 0.30488279461860657, + "learning_rate": 1e-06, + "loss": -0.0629, + "num_tokens": 154937918.0, + "reward": 0.543526828289032, + "reward_std": 0.1814917027950287, + "rewards/verify_math_reward/mean": 0.5435267686843872, + "rewards/verify_math_reward/std": 0.49838000535964966, + "step": 1053 + }, + { + "clip_ratio/high_max": 0.0030096502196101937, + "clip_ratio/high_mean": 0.0012082152516086353, + "clip_ratio/low_mean": 0.0008828598911350127, + "clip_ratio/low_min": 1.0511267646506894e-05, + "clip_ratio/region_mean": 0.002091075155476574, + "epoch": 2.4619422572178475, + "grad_norm": 0.2740449011325836, + "learning_rate": 1e-06, + "loss": -0.0631, + "step": 1054 + }, + { + "clip_ratio/high_max": 0.0032181386195588857, + "clip_ratio/high_mean": 0.0013170854817872168, + "clip_ratio/low_mean": 0.0009968225913326023, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023139080876717344, + "epoch": 2.464275298920968, + "grad_norm": 0.2314031422138214, + "learning_rate": 1e-06, + "loss": -0.0633, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.002719180021813372, + "clip_ratio/high_mean": 0.001063241807059967, + "clip_ratio/low_mean": 0.0011379034131095978, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022011452019796707, + "epoch": 2.466608340624089, + "grad_norm": 0.2776223123073578, + "learning_rate": 1e-06, + "loss": -0.0633, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.00232488347319304, + "clip_ratio/high_mean": 0.0007725332707195776, + "clip_ratio/low_mean": 0.0004357351265298348, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001208268353366293, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3643.0, + "completions/mean_length": 1210.6551513671875, + "completions/mean_terminated_length": 611.8099365234375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 2.468941382327209, + "grad_norm": 0.33703455328941345, + "learning_rate": 1e-06, + "loss": -0.0685, + "num_tokens": 155480625.0, + "reward": 0.5625, + "reward_std": 0.13467255234718323, + "rewards/verify_math_reward/mean": 0.5625, + "rewards/verify_math_reward/std": 0.49635544419288635, + "step": 1057 + }, + { + "clip_ratio/high_max": 0.002764700235275086, + "clip_ratio/high_mean": 0.0009506015940132784, + "clip_ratio/low_mean": 0.0006605244466300064, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016111260156321805, + "epoch": 2.4712744240303297, + "grad_norm": 0.3677258789539337, + "learning_rate": 1e-06, + "loss": -0.0688, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.0029668719434994273, + "clip_ratio/high_mean": 0.0009761436376720667, + "clip_ratio/low_mean": 0.0008182404535546084, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017943840939551592, + "epoch": 2.47360746573345, + "grad_norm": 0.29488086700439453, + "learning_rate": 1e-06, + "loss": -0.0689, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.0024077127018244937, + "clip_ratio/high_mean": 0.000827830983325839, + "clip_ratio/low_mean": 0.001010290815429471, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018381218214926776, + "epoch": 2.4759405074365706, + "grad_norm": 0.22862303256988525, + "learning_rate": 1e-06, + "loss": -0.0689, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0029381492131506093, + "clip_ratio/high_mean": 0.0012637385625566822, + "clip_ratio/low_mean": 0.0007906910850579152, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020544295985018834, + "completions/clipped_ratio": 0.1629464285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3153.0, + "completions/mean_length": 1159.727783203125, + "completions/mean_terminated_length": 588.13330078125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 2.478273549139691, + "grad_norm": 0.3715926706790924, + "learning_rate": 1e-06, + "loss": -0.0747, + "num_tokens": 156005069.0, + "reward": 0.5993303656578064, + "reward_std": 0.1802881807088852, + "rewards/verify_math_reward/mean": 0.5993303656578064, + "rewards/verify_math_reward/std": 0.49030786752700806, + "step": 1061 + }, + { + "clip_ratio/high_max": 0.0034059446115861647, + "clip_ratio/high_mean": 0.0013395270325418096, + "clip_ratio/low_mean": 0.0010574301250017015, + "clip_ratio/low_min": 1.3001872503082268e-05, + "clip_ratio/region_mean": 0.002396957170276437, + "epoch": 2.4806065908428114, + "grad_norm": 0.29294928908348083, + "learning_rate": 1e-06, + "loss": -0.075, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.003073756306548603, + "clip_ratio/high_mean": 0.001338158421276603, + "clip_ratio/low_mean": 0.0012771279207299813, + "clip_ratio/low_min": 2.6003745006164536e-05, + "clip_ratio/region_mean": 0.0026152863647439517, + "epoch": 2.4829396325459316, + "grad_norm": 0.29861995577812195, + "learning_rate": 1e-06, + "loss": -0.0751, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.0038295917911455035, + "clip_ratio/high_mean": 0.0014337743923533708, + "clip_ratio/low_mean": 0.0014355370567500358, + "clip_ratio/low_min": 1.3001872503082268e-05, + "clip_ratio/region_mean": 0.0028693113999906927, + "epoch": 2.4852726742490523, + "grad_norm": 0.27737295627593994, + "learning_rate": 1e-06, + "loss": -0.0753, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.002714105990889948, + "clip_ratio/high_mean": 0.0009218911745847436, + "clip_ratio/low_mean": 0.0005902984266867861, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015121895994525403, + "completions/clipped_ratio": 0.1428571428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2535.0, + "completions/mean_length": 1139.15625, + "completions/mean_terminated_length": 646.3489990234375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 2.4876057159521725, + "grad_norm": 0.2811812162399292, + "learning_rate": 1e-06, + "loss": -0.0534, + "num_tokens": 156587673.0, + "reward": 0.5948660969734192, + "reward_std": 0.15857158601284027, + "rewards/verify_math_reward/mean": 0.5948660969734192, + "rewards/verify_math_reward/std": 0.49119213223457336, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0030347594947670586, + "clip_ratio/high_mean": 0.001111539651901694, + "clip_ratio/low_mean": 0.0007866604082664708, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018982000910909846, + "epoch": 2.489938757655293, + "grad_norm": 0.260781854391098, + "learning_rate": 1e-06, + "loss": -0.0537, + "step": 1066 + }, + { + "clip_ratio/high_max": 0.0033222405400010757, + "clip_ratio/high_mean": 0.0011177840005984763, + "clip_ratio/low_mean": 0.0009584187682776246, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020762028070748784, + "epoch": 2.4922717993584134, + "grad_norm": 0.2341073453426361, + "learning_rate": 1e-06, + "loss": -0.0539, + "step": 1067 + }, + { + "clip_ratio/high_max": 0.002542661844927352, + "clip_ratio/high_mean": 0.0010024788025475573, + "clip_ratio/low_mean": 0.0011134628221043386, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002115941599186044, + "epoch": 2.494604841061534, + "grad_norm": 0.237702876329422, + "learning_rate": 1e-06, + "loss": -0.0539, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0029128591049811803, + "clip_ratio/high_mean": 0.001122817240684526, + "clip_ratio/low_mean": 0.0003881173327044962, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015109345877135638, + "completions/clipped_ratio": 0.1964285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3621.0, + "completions/mean_length": 1375.216552734375, + "completions/mean_terminated_length": 710.1361083984375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 2.4969378827646542, + "grad_norm": 0.30831193923950195, + "learning_rate": 1e-06, + "loss": -0.0844, + "num_tokens": 157197947.0, + "reward": 0.5401785969734192, + "reward_std": 0.16841016709804535, + "rewards/verify_math_reward/mean": 0.5401785969734192, + "rewards/verify_math_reward/std": 0.49866142868995667, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.003005316306371242, + "clip_ratio/high_mean": 0.0012329331184446346, + "clip_ratio/low_mean": 0.0005590850987573504, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017920181853696704, + "epoch": 2.499270924467775, + "grad_norm": 0.29480475187301636, + "learning_rate": 1e-06, + "loss": -0.0846, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.003337475980515592, + "clip_ratio/high_mean": 0.0013035204883635743, + "clip_ratio/low_mean": 0.0007201540975074749, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020236746131558903, + "epoch": 2.5016039661708955, + "grad_norm": 0.21608605980873108, + "learning_rate": 1e-06, + "loss": -0.0848, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.0032416925678262487, + "clip_ratio/high_mean": 0.001307557919062674, + "clip_ratio/low_mean": 0.0008353501461897395, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021429080952657387, + "epoch": 2.5039370078740157, + "grad_norm": 0.2243119776248932, + "learning_rate": 1e-06, + "loss": -0.0849, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0021912348056503106, + "clip_ratio/high_mean": 0.0009612147186999209, + "clip_ratio/low_mean": 0.0006734274684276897, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00163464221986942, + "completions/clipped_ratio": 0.1517857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3212.0, + "completions/mean_length": 1191.6373291015625, + "completions/mean_terminated_length": 671.9092407226562, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 2.506270049577136, + "grad_norm": 0.32219722867012024, + "learning_rate": 1e-06, + "loss": -0.0749, + "num_tokens": 157793438.0, + "reward": 0.590401828289032, + "reward_std": 0.17484305799007416, + "rewards/verify_math_reward/mean": 0.5904017686843872, + "rewards/verify_math_reward/std": 0.49203425645828247, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.0026715986750787124, + "clip_ratio/high_mean": 0.0010693446074583335, + "clip_ratio/low_mean": 0.0008631762666482246, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001932520914124325, + "epoch": 2.5086030912802566, + "grad_norm": 0.254000723361969, + "learning_rate": 1e-06, + "loss": -0.0752, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.0026933696353808045, + "clip_ratio/high_mean": 0.00115871095113107, + "clip_ratio/low_mean": 0.001100592824514024, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022593038374907337, + "epoch": 2.5109361329833773, + "grad_norm": 0.2328997105360031, + "learning_rate": 1e-06, + "loss": -0.0753, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0028204722548252903, + "clip_ratio/high_mean": 0.0010656727863533888, + "clip_ratio/low_mean": 0.001168592214526143, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022342649899655953, + "epoch": 2.5132691746864975, + "grad_norm": 0.2636139392852783, + "learning_rate": 1e-06, + "loss": -0.0753, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0027340222732163966, + "clip_ratio/high_mean": 0.0009299906196247321, + "clip_ratio/low_mean": 0.0007416015505441464, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001671592181082815, + "completions/clipped_ratio": 0.1674107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3079.0, + "completions/mean_length": 1231.919677734375, + "completions/mean_terminated_length": 656.0321655273438, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 2.515602216389618, + "grad_norm": 0.34658893942832947, + "learning_rate": 1e-06, + "loss": -0.0651, + "num_tokens": 158376614.0, + "reward": 0.5714285969734192, + "reward_std": 0.15890717506408691, + "rewards/verify_math_reward/mean": 0.5714285969734192, + "rewards/verify_math_reward/std": 0.49514803290367126, + "step": 1077 + }, + { + "clip_ratio/high_max": 0.002843072659743484, + "clip_ratio/high_mean": 0.0009990428261517081, + "clip_ratio/low_mean": 0.0010295792199030984, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002028622016950976, + "epoch": 2.5179352580927383, + "grad_norm": 0.2607556879520416, + "learning_rate": 1e-06, + "loss": -0.0652, + "step": 1078 + }, + { + "clip_ratio/high_max": 0.0030230559350457042, + "clip_ratio/high_mean": 0.0010666556881915312, + "clip_ratio/low_mean": 0.0012042350135743618, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002270890661748126, + "epoch": 2.520268299795859, + "grad_norm": 0.23274657130241394, + "learning_rate": 1e-06, + "loss": -0.0654, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.0026979696049238555, + "clip_ratio/high_mean": 0.0009742539659782778, + "clip_ratio/low_mean": 0.0013522894550987985, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023265434792847373, + "epoch": 2.522601341498979, + "grad_norm": 0.2244875580072403, + "learning_rate": 1e-06, + "loss": -0.0655, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0018652241342351772, + "clip_ratio/high_mean": 0.0006941004485270241, + "clip_ratio/low_mean": 0.0004151139025907469, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011092143586211023, + "completions/clipped_ratio": 0.1629464285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3628.0, + "completions/mean_length": 1252.193115234375, + "completions/mean_terminated_length": 698.5986328125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 2.5249343832021, + "grad_norm": 0.3375288248062134, + "learning_rate": 1e-06, + "loss": -0.0258, + "num_tokens": 159011563.0, + "reward": 0.520089328289032, + "reward_std": 0.1151694729924202, + "rewards/verify_math_reward/mean": 0.5200892686843872, + "rewards/verify_math_reward/std": 0.4998753070831299, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.002928855452410062, + "clip_ratio/high_mean": 0.0008989085581561085, + "clip_ratio/low_mean": 0.0005837733792759536, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014826819660811452, + "epoch": 2.52726742490522, + "grad_norm": 0.2433030605316162, + "learning_rate": 1e-06, + "loss": -0.0261, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.0029415451353997923, + "clip_ratio/high_mean": 0.0009354234507554793, + "clip_ratio/low_mean": 0.0006812501569584128, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001616673598618945, + "epoch": 2.5296004666083407, + "grad_norm": 0.20506641268730164, + "learning_rate": 1e-06, + "loss": -0.0263, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.0024541875172872096, + "clip_ratio/high_mean": 0.0008309054683195427, + "clip_ratio/low_mean": 0.0008008740928744373, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001631779559829738, + "epoch": 2.531933508311461, + "grad_norm": 0.21931932866573334, + "learning_rate": 1e-06, + "loss": -0.0262, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0022768886665289756, + "clip_ratio/high_mean": 0.000846750735945534, + "clip_ratio/low_mean": 0.0005720474728150293, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014187982269504573, + "completions/clipped_ratio": 0.1339285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3729.0, + "completions/mean_length": 1088.0703125, + "completions/mean_terminated_length": 622.926513671875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 2.5342665500145816, + "grad_norm": 0.370697945356369, + "learning_rate": 1e-06, + "loss": -0.0638, + "num_tokens": 159590042.0, + "reward": 0.5714285969734192, + "reward_std": 0.1486460566520691, + "rewards/verify_math_reward/mean": 0.5714285969734192, + "rewards/verify_math_reward/std": 0.49514803290367126, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.002662609243998304, + "clip_ratio/high_mean": 0.001032141619361937, + "clip_ratio/low_mean": 0.0008312277450386318, + "clip_ratio/low_min": 4.931276271236129e-05, + "clip_ratio/region_mean": 0.0018633693834999576, + "epoch": 2.536599591717702, + "grad_norm": 0.29990503191947937, + "learning_rate": 1e-06, + "loss": -0.064, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.0032535009559069294, + "clip_ratio/high_mean": 0.0010791864497150527, + "clip_ratio/low_mean": 0.0009813344277063152, + "clip_ratio/low_min": 4.732724482892081e-05, + "clip_ratio/region_mean": 0.0020605208810593467, + "epoch": 2.5389326334208224, + "grad_norm": 0.23016409575939178, + "learning_rate": 1e-06, + "loss": -0.0642, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.003006082901265472, + "clip_ratio/high_mean": 0.0011064850477850996, + "clip_ratio/low_mean": 0.001108902984924498, + "clip_ratio/low_min": 1.1548410839168355e-05, + "clip_ratio/region_mean": 0.0022153881000122055, + "epoch": 2.5412656751239426, + "grad_norm": 0.2070586383342743, + "learning_rate": 1e-06, + "loss": -0.0643, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0025503474244032986, + "clip_ratio/high_mean": 0.0011560129387362394, + "clip_ratio/low_mean": 0.0007998784167284612, + "clip_ratio/low_min": 2.5105442546191625e-05, + "clip_ratio/region_mean": 0.0019558913918444887, + "completions/clipped_ratio": 0.1372767857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2836.0, + "completions/mean_length": 1123.438720703125, + "completions/mean_terminated_length": 650.4437255859375, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 2.5435987168270633, + "grad_norm": 0.3656218349933624, + "learning_rate": 1e-06, + "loss": -0.0502, + "num_tokens": 160174971.0, + "reward": 0.5535714626312256, + "reward_std": 0.197794571518898, + "rewards/verify_math_reward/mean": 0.5535714030265808, + "rewards/verify_math_reward/std": 0.4973995089530945, + "step": 1089 + }, + { + "clip_ratio/high_max": 0.002961598554975353, + "clip_ratio/high_mean": 0.001348602381767705, + "clip_ratio/low_mean": 0.0010555532207945362, + "clip_ratio/low_min": 2.5105442546191625e-05, + "clip_ratio/region_mean": 0.0024041556171141565, + "epoch": 2.545931758530184, + "grad_norm": 0.3478671908378601, + "learning_rate": 1e-06, + "loss": -0.0506, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0028374877947499044, + "clip_ratio/high_mean": 0.0013154700263839914, + "clip_ratio/low_mean": 0.0012582149520312669, + "clip_ratio/low_min": 2.5105442546191625e-05, + "clip_ratio/region_mean": 0.0025736849784152582, + "epoch": 2.548264800233304, + "grad_norm": 0.2995145618915558, + "learning_rate": 1e-06, + "loss": -0.0508, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.0030797913859714754, + "clip_ratio/high_mean": 0.0012829393544961931, + "clip_ratio/low_mean": 0.0015215690691547934, + "clip_ratio/low_min": 1.4302059753390495e-05, + "clip_ratio/region_mean": 0.0028045084109180607, + "epoch": 2.5505978419364244, + "grad_norm": 0.33942118287086487, + "learning_rate": 1e-06, + "loss": -0.051, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0027154575218446553, + "clip_ratio/high_mean": 0.0009403135609318269, + "clip_ratio/low_mean": 0.0004383714103823877, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013786849558528047, + "completions/clipped_ratio": 0.1774553571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3796.0, + "completions/mean_length": 1330.404052734375, + "completions/mean_terminated_length": 733.7557983398438, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 2.552930883639545, + "grad_norm": 0.3024427592754364, + "learning_rate": 1e-06, + "loss": -0.0898, + "num_tokens": 160800557.0, + "reward": 0.53125, + "reward_std": 0.1597309112548828, + "rewards/verify_math_reward/mean": 0.53125, + "rewards/verify_math_reward/std": 0.4993011951446533, + "step": 1093 + }, + { + "clip_ratio/high_max": 0.003726006718352437, + "clip_ratio/high_mean": 0.0012743875086016487, + "clip_ratio/low_mean": 0.0005434095000964589, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018177969686803408, + "epoch": 2.5552639253426657, + "grad_norm": 0.25026634335517883, + "learning_rate": 1e-06, + "loss": -0.0899, + "step": 1094 + }, + { + "clip_ratio/high_max": 0.0036933976589352824, + "clip_ratio/high_mean": 0.0012457342709240038, + "clip_ratio/low_mean": 0.0006731112216584734, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019188454753020778, + "epoch": 2.557596967045786, + "grad_norm": 0.23310990631580353, + "learning_rate": 1e-06, + "loss": -0.0901, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.003475955832982436, + "clip_ratio/high_mean": 0.0011531089658092242, + "clip_ratio/low_mean": 0.0007925101235741749, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019456190675555263, + "epoch": 2.5599300087489065, + "grad_norm": 0.24037988483905792, + "learning_rate": 1e-06, + "loss": -0.0902, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0025270912956329994, + "clip_ratio/high_mean": 0.0011240604162594536, + "clip_ratio/low_mean": 0.0009068060808203882, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020308665043557994, + "completions/clipped_ratio": 0.1774553571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3769.0, + "completions/mean_length": 1283.2578125, + "completions/mean_terminated_length": 676.4382934570312, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.5622630504520267, + "grad_norm": 0.35510119795799255, + "learning_rate": 1e-06, + "loss": -0.0802, + "num_tokens": 161386172.0, + "reward": 0.5178571939468384, + "reward_std": 0.18468938767910004, + "rewards/verify_math_reward/mean": 0.5178571343421936, + "rewards/verify_math_reward/std": 0.4999600946903229, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.003006246748554986, + "clip_ratio/high_mean": 0.0013421027542790398, + "clip_ratio/low_mean": 0.0010785836548166117, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002420686447294429, + "epoch": 2.5645960921551474, + "grad_norm": 0.37266677618026733, + "learning_rate": 1e-06, + "loss": -0.0804, + "step": 1098 + }, + { + "clip_ratio/high_max": 0.002982464611704927, + "clip_ratio/high_mean": 0.0012667034025071189, + "clip_ratio/low_mean": 0.0013548635179176927, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002621566949528642, + "epoch": 2.5669291338582676, + "grad_norm": 0.23731666803359985, + "learning_rate": 1e-06, + "loss": -0.0806, + "step": 1099 + }, + { + "clip_ratio/high_max": 0.00288599579653237, + "clip_ratio/high_mean": 0.0012670163796428824, + "clip_ratio/low_mean": 0.001627523721253965, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002894540004490409, + "epoch": 2.5692621755613883, + "grad_norm": 0.2168024182319641, + "learning_rate": 1e-06, + "loss": -0.0808, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.002326938461919781, + "clip_ratio/high_mean": 0.0009346261258542654, + "clip_ratio/low_mean": 0.0005173175809431996, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014519437027047388, + "completions/clipped_ratio": 0.1272321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2913.0, + "completions/mean_length": 1109.4710693359375, + "completions/mean_terminated_length": 674.0946044921875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 2.5715952172645085, + "grad_norm": 0.3829411268234253, + "learning_rate": 1e-06, + "loss": -0.0594, + "num_tokens": 162010586.0, + "reward": 0.6383928656578064, + "reward_std": 0.1537972390651703, + "rewards/verify_math_reward/mean": 0.6383928656578064, + "rewards/verify_math_reward/std": 0.4807341694831848, + "step": 1101 + }, + { + "clip_ratio/high_max": 0.0024993760089273565, + "clip_ratio/high_mean": 0.0010787901337607764, + "clip_ratio/low_mean": 0.0007505401317757787, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018293302418896928, + "epoch": 2.573928258967629, + "grad_norm": 0.2564522325992584, + "learning_rate": 1e-06, + "loss": -0.0596, + "step": 1102 + }, + { + "clip_ratio/high_max": 0.0030906805586710107, + "clip_ratio/high_mean": 0.0011437192715675337, + "clip_ratio/low_mean": 0.0009046731920534512, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020483924672589637, + "epoch": 2.5762613006707493, + "grad_norm": 0.20611624419689178, + "learning_rate": 1e-06, + "loss": -0.0598, + "step": 1103 + }, + { + "clip_ratio/high_max": 0.0025507345417281613, + "clip_ratio/high_mean": 0.0010343282538087806, + "clip_ratio/low_mean": 0.0010219057076028548, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020562340068863705, + "epoch": 2.57859434237387, + "grad_norm": 0.30897054076194763, + "learning_rate": 1e-06, + "loss": -0.0599, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0027421125705586746, + "clip_ratio/high_mean": 0.0010197161354881246, + "clip_ratio/low_mean": 0.0004768837093251932, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001496599848906044, + "completions/clipped_ratio": 0.1305803571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3928.0, + "completions/mean_length": 1054.1317138671875, + "completions/mean_terminated_length": 597.2657470703125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 2.5809273840769906, + "grad_norm": 0.32677143812179565, + "learning_rate": 1e-06, + "loss": -0.0551, + "num_tokens": 162562792.0, + "reward": 0.5959821939468384, + "reward_std": 0.15357083082199097, + "rewards/verify_math_reward/mean": 0.5959821343421936, + "rewards/verify_math_reward/std": 0.490975022315979, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0025771559012355283, + "clip_ratio/high_mean": 0.0010178010234085377, + "clip_ratio/low_mean": 0.000736704959308554, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017545059599797241, + "epoch": 2.583260425780111, + "grad_norm": 0.27515554428100586, + "learning_rate": 1e-06, + "loss": -0.0553, + "step": 1106 + }, + { + "clip_ratio/high_max": 0.0029067202340229414, + "clip_ratio/high_mean": 0.0011057470001105685, + "clip_ratio/low_mean": 0.000855755053635221, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001961502101039514, + "epoch": 2.585593467483231, + "grad_norm": 0.22235415875911713, + "learning_rate": 1e-06, + "loss": -0.0555, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.0027928195049753413, + "clip_ratio/high_mean": 0.0010616704385029152, + "clip_ratio/low_mean": 0.0010224524312434369, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002084122876112815, + "epoch": 2.5879265091863517, + "grad_norm": 0.27045929431915283, + "learning_rate": 1e-06, + "loss": -0.0555, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.002150590888049919, + "clip_ratio/high_mean": 0.0010427422366774408, + "clip_ratio/low_mean": 0.000490098445879994, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015328407025663182, + "completions/clipped_ratio": 0.1261160714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3760.0, + "completions/mean_length": 1091.25, + "completions/mean_terminated_length": 657.6142578125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "epoch": 2.5902595508894724, + "grad_norm": 0.4157736897468567, + "learning_rate": 1e-06, + "loss": -0.06, + "num_tokens": 163170296.0, + "reward": 0.5647321939468384, + "reward_std": 0.16578474640846252, + "rewards/verify_math_reward/mean": 0.5647321343421936, + "rewards/verify_math_reward/std": 0.49606895446777344, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.0026978680325555615, + "clip_ratio/high_mean": 0.0011257234727963805, + "clip_ratio/low_mean": 0.0007382088842859957, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018639323971001431, + "epoch": 2.5925925925925926, + "grad_norm": 0.25867852568626404, + "learning_rate": 1e-06, + "loss": -0.0603, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0028489320684457198, + "clip_ratio/high_mean": 0.0012609830337169115, + "clip_ratio/low_mean": 0.0008413531631958904, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002102336278767325, + "epoch": 2.5949256342957128, + "grad_norm": 0.25433149933815, + "learning_rate": 1e-06, + "loss": -0.0605, + "step": 1111 + }, + { + "clip_ratio/high_max": 0.0024162857080227695, + "clip_ratio/high_mean": 0.0010507349834369961, + "clip_ratio/low_mean": 0.0009205314836435718, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019712664870894514, + "epoch": 2.5972586759988334, + "grad_norm": 0.2389124184846878, + "learning_rate": 1e-06, + "loss": -0.0605, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.00232755062461365, + "clip_ratio/high_mean": 0.0009195824804919539, + "clip_ratio/low_mean": 0.0005153276651981287, + "clip_ratio/low_min": 1.2817883543903008e-05, + "clip_ratio/region_mean": 0.001434910129319178, + "completions/clipped_ratio": 0.1495535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3426.0, + "completions/mean_length": 1138.8426513671875, + "completions/mean_terminated_length": 618.8175659179688, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 2.599591717701954, + "grad_norm": 0.33814549446105957, + "learning_rate": 1e-06, + "loss": -0.0936, + "num_tokens": 163728859.0, + "reward": 0.6272321939468384, + "reward_std": 0.1795371025800705, + "rewards/verify_math_reward/mean": 0.6272321343421936, + "rewards/verify_math_reward/std": 0.4838111698627472, + "step": 1113 + }, + { + "clip_ratio/high_max": 0.0033839424868347123, + "clip_ratio/high_mean": 0.0012554665518109687, + "clip_ratio/low_mean": 0.0007289757140824804, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00198444229317829, + "epoch": 2.6019247594050743, + "grad_norm": 0.25683408975601196, + "learning_rate": 1e-06, + "loss": -0.0939, + "step": 1114 + }, + { + "clip_ratio/high_max": 0.002822003727487754, + "clip_ratio/high_mean": 0.001149767565948423, + "clip_ratio/low_mean": 0.000779302723458386, + "clip_ratio/low_min": 1.2332280675764196e-05, + "clip_ratio/region_mean": 0.0019290702839498408, + "epoch": 2.604257801108195, + "grad_norm": 0.26113370060920715, + "learning_rate": 1e-06, + "loss": -0.094, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.002664569779881276, + "clip_ratio/high_mean": 0.0010907177729677642, + "clip_ratio/low_mean": 0.0010436815664434107, + "clip_ratio/low_min": 2.466456135152839e-05, + "clip_ratio/region_mean": 0.002134399299393408, + "epoch": 2.606590842811315, + "grad_norm": 0.21102070808410645, + "learning_rate": 1e-06, + "loss": -0.0941, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0020429506421351107, + "clip_ratio/high_mean": 0.0006444957043640898, + "clip_ratio/low_mean": 0.000556617698066475, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012011134040221805, + "completions/clipped_ratio": 0.1216517857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3814.0, + "completions/mean_length": 1052.1273193359375, + "completions/mean_terminated_length": 630.5488891601562, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 2.608923884514436, + "grad_norm": 0.3030131161212921, + "learning_rate": 1e-06, + "loss": -0.0359, + "num_tokens": 164309485.0, + "reward": 0.5524553656578064, + "reward_std": 0.13876289129257202, + "rewards/verify_math_reward/mean": 0.5524553656578064, + "rewards/verify_math_reward/std": 0.49751853942871094, + "step": 1117 + }, + { + "clip_ratio/high_max": 0.0025315484544989886, + "clip_ratio/high_mean": 0.0008370420373466914, + "clip_ratio/low_mean": 0.0007493260559385817, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001586368085554568, + "epoch": 2.611256926217556, + "grad_norm": 0.25326409935951233, + "learning_rate": 1e-06, + "loss": -0.036, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.0025373918942932505, + "clip_ratio/high_mean": 0.00083686521384152, + "clip_ratio/low_mean": 0.001015362488487881, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018522277350712102, + "epoch": 2.6135899679206767, + "grad_norm": 0.2460647076368332, + "learning_rate": 1e-06, + "loss": -0.0362, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.002477307054505218, + "clip_ratio/high_mean": 0.0008151182400979451, + "clip_ratio/low_mean": 0.0010352167614655627, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018503350001992658, + "epoch": 2.615923009623797, + "grad_norm": 0.24024319648742676, + "learning_rate": 1e-06, + "loss": -0.0363, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0021482690281118266, + "clip_ratio/high_mean": 0.0008689304322615499, + "clip_ratio/low_mean": 0.0005232768307905644, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013922072866989765, + "completions/clipped_ratio": 0.1127232142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3888.0, + "completions/mean_length": 996.0848388671875, + "completions/mean_terminated_length": 602.2590942382812, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 2.6182560513269175, + "grad_norm": 0.2719813585281372, + "learning_rate": 1e-06, + "loss": -0.0492, + "num_tokens": 164869201.0, + "reward": 0.660714328289032, + "reward_std": 0.15390713512897491, + "rewards/verify_math_reward/mean": 0.6607142686843872, + "rewards/verify_math_reward/std": 0.4737313687801361, + "step": 1121 + }, + { + "clip_ratio/high_max": 0.002664324529177975, + "clip_ratio/high_mean": 0.0010516172642383026, + "clip_ratio/low_mean": 0.0006718581489622011, + "clip_ratio/low_min": 1.4859724615234882e-05, + "clip_ratio/region_mean": 0.001723475430480903, + "epoch": 2.6205890930300377, + "grad_norm": 0.3058817982673645, + "learning_rate": 1e-06, + "loss": -0.0493, + "step": 1122 + }, + { + "clip_ratio/high_max": 0.0028086251113563776, + "clip_ratio/high_mean": 0.001154651603428647, + "clip_ratio/low_mean": 0.0008613234749645926, + "clip_ratio/low_min": 2.2080905182519928e-05, + "clip_ratio/region_mean": 0.002015975085669197, + "epoch": 2.6229221347331584, + "grad_norm": 0.19979745149612427, + "learning_rate": 1e-06, + "loss": -0.0496, + "step": 1123 + }, + { + "clip_ratio/high_max": 0.0024760094711382408, + "clip_ratio/high_mean": 0.0009517693533780403, + "clip_ratio/low_mean": 0.00111147116876964, + "clip_ratio/low_min": 1.1040452591259964e-05, + "clip_ratio/region_mean": 0.0020632405357901007, + "epoch": 2.625255176436279, + "grad_norm": 0.2403215914964676, + "learning_rate": 1e-06, + "loss": -0.0496, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0027720848738681525, + "clip_ratio/high_mean": 0.0009584093750163447, + "clip_ratio/low_mean": 0.000537255859853758, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014956652375985868, + "completions/clipped_ratio": 0.1160714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3061.0, + "completions/mean_length": 1032.552490234375, + "completions/mean_terminated_length": 630.2815551757812, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 2.6275882181393992, + "grad_norm": 0.2966957688331604, + "learning_rate": 1e-06, + "loss": -0.061, + "num_tokens": 165453048.0, + "reward": 0.6517857313156128, + "reward_std": 0.17476709187030792, + "rewards/verify_math_reward/mean": 0.6517857313156128, + "rewards/verify_math_reward/std": 0.47667041420936584, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.00300396719831042, + "clip_ratio/high_mean": 0.001154687659436604, + "clip_ratio/low_mean": 0.0008199599869840313, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001974647617316805, + "epoch": 2.6299212598425195, + "grad_norm": 0.25790533423423767, + "learning_rate": 1e-06, + "loss": -0.0612, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.0032114461428136565, + "clip_ratio/high_mean": 0.0012655668942898046, + "clip_ratio/low_mean": 0.0009432689948880579, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022088358527980745, + "epoch": 2.63225430154564, + "grad_norm": 0.21397745609283447, + "learning_rate": 1e-06, + "loss": -0.0614, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.0031452557304874063, + "clip_ratio/high_mean": 0.001191447696328396, + "clip_ratio/low_mean": 0.001080630059732357, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022720778069924563, + "epoch": 2.6345873432487608, + "grad_norm": 0.25344526767730713, + "learning_rate": 1e-06, + "loss": -0.0614, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0022820700614829548, + "clip_ratio/high_mean": 0.0009935147118085297, + "clip_ratio/low_mean": 0.0007081885323714232, + "clip_ratio/low_min": 4.7674659072072245e-05, + "clip_ratio/region_mean": 0.0017017032078001648, + "completions/clipped_ratio": 0.1506696428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3312.0, + "completions/mean_length": 1139.97216796875, + "completions/mean_terminated_length": 615.5781860351562, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 2.636920384951881, + "grad_norm": 0.32561057806015015, + "learning_rate": 1e-06, + "loss": -0.0586, + "num_tokens": 166002015.0, + "reward": 0.5837053656578064, + "reward_std": 0.1879894882440567, + "rewards/verify_math_reward/mean": 0.5837053656578064, + "rewards/verify_math_reward/std": 0.49321895837783813, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.002762396055913996, + "clip_ratio/high_mean": 0.001226898548338795, + "clip_ratio/low_mean": 0.000930823360249633, + "clip_ratio/low_min": 7.519316386606079e-05, + "clip_ratio/region_mean": 0.0021577219049504492, + "epoch": 2.6392534266550016, + "grad_norm": 0.25686073303222656, + "learning_rate": 1e-06, + "loss": -0.0589, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0028286995220696554, + "clip_ratio/high_mean": 0.0012117004789615748, + "clip_ratio/low_mean": 0.0010972727650369052, + "clip_ratio/low_min": 3.666993507067673e-05, + "clip_ratio/region_mean": 0.002308973234903533, + "epoch": 2.641586468358122, + "grad_norm": 0.2529727518558502, + "learning_rate": 1e-06, + "loss": -0.0591, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.0026503606131882407, + "clip_ratio/high_mean": 0.0011134245469293091, + "clip_ratio/low_mean": 0.0013780929548374843, + "clip_ratio/low_min": 9.493100515101105e-05, + "clip_ratio/region_mean": 0.0024915175017667934, + "epoch": 2.6439195100612425, + "grad_norm": 0.25941675901412964, + "learning_rate": 1e-06, + "loss": -0.0592, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0023725707796984352, + "clip_ratio/high_mean": 0.0009655496851337375, + "clip_ratio/low_mean": 0.0005347466503735632, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015002963627921417, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3983.0, + "completions/mean_length": 982.0357666015625, + "completions/mean_terminated_length": 599.6190185546875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 2.6462525517643627, + "grad_norm": 0.2986929714679718, + "learning_rate": 1e-06, + "loss": -0.0672, + "num_tokens": 166575767.0, + "reward": 0.6194196939468384, + "reward_std": 0.189485564827919, + "rewards/verify_math_reward/mean": 0.6194196343421936, + "rewards/verify_math_reward/std": 0.48580074310302734, + "step": 1133 + }, + { + "clip_ratio/high_max": 0.0027116593992104754, + "clip_ratio/high_mean": 0.0011672088367049582, + "clip_ratio/low_mean": 0.0007560647736681858, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001923273601278197, + "epoch": 2.6485855934674833, + "grad_norm": 0.272161066532135, + "learning_rate": 1e-06, + "loss": -0.0675, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.002445779689878691, + "clip_ratio/high_mean": 0.0011025048988813069, + "clip_ratio/low_mean": 0.0009129698701144662, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002015474768995773, + "epoch": 2.6509186351706036, + "grad_norm": 0.204036146402359, + "learning_rate": 1e-06, + "loss": -0.0676, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0024223813525168225, + "clip_ratio/high_mean": 0.0011369093772373162, + "clip_ratio/low_mean": 0.0010873437604459468, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022242531340452842, + "epoch": 2.653251676873724, + "grad_norm": 0.2310946136713028, + "learning_rate": 1e-06, + "loss": -0.0677, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0019093765258730855, + "clip_ratio/high_mean": 0.0007528779151471099, + "clip_ratio/low_mean": 0.0005131302204972599, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00126600810472155, + "completions/clipped_ratio": 0.1573660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4006.0, + "completions/mean_length": 1196.7523193359375, + "completions/mean_terminated_length": 655.3033447265625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "epoch": 2.6555847185768444, + "grad_norm": 0.2802563011646271, + "learning_rate": 1e-06, + "loss": -0.056, + "num_tokens": 167160089.0, + "reward": 0.5691964626312256, + "reward_std": 0.15916569530963898, + "rewards/verify_math_reward/mean": 0.5691964030265808, + "rewards/verify_math_reward/std": 0.4954652488231659, + "step": 1137 + }, + { + "clip_ratio/high_max": 0.0025058464889298193, + "clip_ratio/high_mean": 0.0009987526918848744, + "clip_ratio/low_mean": 0.0006417501290343353, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016405027927248739, + "epoch": 2.657917760279965, + "grad_norm": 0.23167243599891663, + "learning_rate": 1e-06, + "loss": -0.0562, + "step": 1138 + }, + { + "clip_ratio/high_max": 0.002632463161717169, + "clip_ratio/high_mean": 0.000962874670221936, + "clip_ratio/low_mean": 0.0008900784996512812, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018529531444073655, + "epoch": 2.6602508019830857, + "grad_norm": 0.32529324293136597, + "learning_rate": 1e-06, + "loss": -0.0564, + "step": 1139 + }, + { + "clip_ratio/high_max": 0.0023656452540308237, + "clip_ratio/high_mean": 0.0009692981020634761, + "clip_ratio/low_mean": 0.0009637194434617413, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019330175855429843, + "epoch": 2.662583843686206, + "grad_norm": 0.27710026502609253, + "learning_rate": 1e-06, + "loss": -0.0564, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0022393335384549573, + "clip_ratio/high_mean": 0.0007632964770891704, + "clip_ratio/low_mean": 0.0005693107414117549, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013326072366908193, + "completions/clipped_ratio": 0.1841517857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3621.0, + "completions/mean_length": 1360.2723388671875, + "completions/mean_terminated_length": 742.7688598632812, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 2.664916885389326, + "grad_norm": 0.2509084641933441, + "learning_rate": 1e-06, + "loss": -0.0584, + "num_tokens": 167792013.0, + "reward": 0.463169664144516, + "reward_std": 0.16214200854301453, + "rewards/verify_math_reward/mean": 0.4631696343421936, + "rewards/verify_math_reward/std": 0.49892017245292664, + "step": 1141 + }, + { + "clip_ratio/high_max": 0.0027459394914330915, + "clip_ratio/high_mean": 0.0010035582890850492, + "clip_ratio/low_mean": 0.0008170855089701945, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018206437562184874, + "epoch": 2.667249927092447, + "grad_norm": 0.22688522934913635, + "learning_rate": 1e-06, + "loss": -0.0586, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.0028888552769785747, + "clip_ratio/high_mean": 0.0009991747792810202, + "clip_ratio/low_mean": 0.0009259738062610268, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019251485900895204, + "epoch": 2.6695829687955674, + "grad_norm": 0.22150474786758423, + "learning_rate": 1e-06, + "loss": -0.0587, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.002439021052850876, + "clip_ratio/high_mean": 0.0009087577436730498, + "clip_ratio/low_mean": 0.0010164745017391397, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019252322454121895, + "epoch": 2.6719160104986877, + "grad_norm": 0.20555339753627777, + "learning_rate": 1e-06, + "loss": -0.0588, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0018441054489812814, + "clip_ratio/high_mean": 0.0007074962049955502, + "clip_ratio/low_mean": 0.0005324415769791813, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001239937777427258, + "completions/clipped_ratio": 0.1428571428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4079.0, + "completions/mean_length": 1170.54248046875, + "completions/mean_terminated_length": 682.9661865234375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 2.674249052201808, + "grad_norm": 0.2705370783805847, + "learning_rate": 1e-06, + "loss": -0.0643, + "num_tokens": 168411147.0, + "reward": 0.5089285969734192, + "reward_std": 0.14042216539382935, + "rewards/verify_math_reward/mean": 0.5089285969734192, + "rewards/verify_math_reward/std": 0.5001994967460632, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.002201586488808971, + "clip_ratio/high_mean": 0.0008728033153602155, + "clip_ratio/low_mean": 0.0007337162696785526, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016065196141425986, + "epoch": 2.6765820939049285, + "grad_norm": 0.24536322057247162, + "learning_rate": 1e-06, + "loss": -0.0645, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.002139316660759505, + "clip_ratio/high_mean": 0.0007986662731127581, + "clip_ratio/low_mean": 0.0007943360560602741, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015930023946566507, + "epoch": 2.678915135608049, + "grad_norm": 0.24811631441116333, + "learning_rate": 1e-06, + "loss": -0.0646, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.0023903483706817497, + "clip_ratio/high_mean": 0.0008260114354925463, + "clip_ratio/low_mean": 0.0009675408273324138, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017935522046172991, + "epoch": 2.6812481773111694, + "grad_norm": 0.2024756819009781, + "learning_rate": 1e-06, + "loss": -0.0647, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0022558805030712392, + "clip_ratio/high_mean": 0.0008015481953407289, + "clip_ratio/low_mean": 0.0005789182469015941, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001380466444970807, + "completions/clipped_ratio": 0.1439732142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3994.0, + "completions/mean_length": 1187.8560791015625, + "completions/mean_terminated_length": 698.7418212890625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 2.68358121901429, + "grad_norm": 0.28669261932373047, + "learning_rate": 1e-06, + "loss": -0.0455, + "num_tokens": 169031930.0, + "reward": 0.5457589626312256, + "reward_std": 0.15522870421409607, + "rewards/verify_math_reward/mean": 0.5457589030265808, + "rewards/verify_math_reward/std": 0.4981797933578491, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.0021396095071395393, + "clip_ratio/high_mean": 0.0008520158498868113, + "clip_ratio/low_mean": 0.0006523220808958285, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015043379098642617, + "epoch": 2.6859142607174102, + "grad_norm": 0.2215685099363327, + "learning_rate": 1e-06, + "loss": -0.0455, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0026519415332586505, + "clip_ratio/high_mean": 0.0009802410786505789, + "clip_ratio/low_mean": 0.000823186988782254, + "clip_ratio/low_min": 1.5277439160854556e-05, + "clip_ratio/region_mean": 0.001803428036510013, + "epoch": 2.688247302420531, + "grad_norm": 0.24253995716571808, + "learning_rate": 1e-06, + "loss": -0.0457, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.002195755841967184, + "clip_ratio/high_mean": 0.0008404661348322406, + "clip_ratio/low_mean": 0.0009513325512671145, + "clip_ratio/low_min": 4.391512266010977e-05, + "clip_ratio/region_mean": 0.001791798677004408, + "epoch": 2.690580344123651, + "grad_norm": 0.20786729454994202, + "learning_rate": 1e-06, + "loss": -0.0458, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.002137276365829166, + "clip_ratio/high_mean": 0.0008418489796895301, + "clip_ratio/low_mean": 0.00039431780010090733, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012361667722871061, + "completions/clipped_ratio": 0.1462053571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3274.0, + "completions/mean_length": 1187.6429443359375, + "completions/mean_terminated_length": 689.6104736328125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 2.6929133858267718, + "grad_norm": 0.26343563199043274, + "learning_rate": 1e-06, + "loss": -0.0643, + "num_tokens": 169643770.0, + "reward": 0.5703125, + "reward_std": 0.1406080424785614, + "rewards/verify_math_reward/mean": 0.5703125, + "rewards/verify_math_reward/std": 0.49530795216560364, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.0022794629840063863, + "clip_ratio/high_mean": 0.0009593052564014215, + "clip_ratio/low_mean": 0.0005149428202457784, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014742480670975056, + "epoch": 2.695246427529892, + "grad_norm": 0.21706290543079376, + "learning_rate": 1e-06, + "loss": -0.0644, + "step": 1154 + }, + { + "clip_ratio/high_max": 0.0026719845554907806, + "clip_ratio/high_mean": 0.00097825049306266, + "clip_ratio/low_mean": 0.0005811857872686232, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015594363103446085, + "epoch": 2.6975794692330126, + "grad_norm": 0.45889315009117126, + "learning_rate": 1e-06, + "loss": -0.0645, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0024308937645400874, + "clip_ratio/high_mean": 0.0009150312143901829, + "clip_ratio/low_mean": 0.0007390931596091832, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016541243676329032, + "epoch": 2.699912510936133, + "grad_norm": 0.20172478258609772, + "learning_rate": 1e-06, + "loss": -0.0646, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.001911978415591875, + "clip_ratio/high_mean": 0.000815377288745367, + "clip_ratio/low_mean": 0.0006175725966386381, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014329498771985527, + "completions/clipped_ratio": 0.1361607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3168.0, + "completions/mean_length": 1103.0257568359375, + "completions/mean_terminated_length": 631.264892578125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 2.7022455526392535, + "grad_norm": 0.3399980962276459, + "learning_rate": 1e-06, + "loss": -0.0641, + "num_tokens": 170219817.0, + "reward": 0.6183035969734192, + "reward_std": 0.15819111466407776, + "rewards/verify_math_reward/mean": 0.6183035969734192, + "rewards/verify_math_reward/std": 0.4860740303993225, + "step": 1157 + }, + { + "clip_ratio/high_max": 0.002753663487965241, + "clip_ratio/high_mean": 0.0010733545605035033, + "clip_ratio/low_mean": 0.000898408230568748, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019717628456419334, + "epoch": 2.704578594342374, + "grad_norm": 0.285305917263031, + "learning_rate": 1e-06, + "loss": -0.0643, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.002390883913903963, + "clip_ratio/high_mean": 0.0009104770833801012, + "clip_ratio/low_mean": 0.0010506581602385268, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001961135247256607, + "epoch": 2.7069116360454943, + "grad_norm": 0.2516597807407379, + "learning_rate": 1e-06, + "loss": -0.0644, + "step": 1159 + }, + { + "clip_ratio/high_max": 0.0023764417856000364, + "clip_ratio/high_mean": 0.0008862508093443466, + "clip_ratio/low_mean": 0.0012949681567988591, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002181219002522994, + "epoch": 2.7092446777486145, + "grad_norm": 0.2154473513364792, + "learning_rate": 1e-06, + "loss": -0.0646, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0021184427678235807, + "clip_ratio/high_mean": 0.0007785532925481675, + "clip_ratio/low_mean": 0.0004812925199075835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012598458342836238, + "completions/clipped_ratio": 0.0970982142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2231.0, + "completions/mean_length": 987.177490234375, + "completions/mean_terminated_length": 652.8541259765625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "epoch": 2.711577719451735, + "grad_norm": 0.2807345390319824, + "learning_rate": 1e-06, + "loss": -0.0352, + "num_tokens": 170838272.0, + "reward": 0.5803571939468384, + "reward_std": 0.14004239439964294, + "rewards/verify_math_reward/mean": 0.5803571343421936, + "rewards/verify_math_reward/std": 0.4937761425971985, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.0022629649320151657, + "clip_ratio/high_mean": 0.0008516832112945849, + "clip_ratio/low_mean": 0.0006314157672022702, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014830990257905796, + "epoch": 2.713910761154856, + "grad_norm": 0.2561696469783783, + "learning_rate": 1e-06, + "loss": -0.0353, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.002565336508268956, + "clip_ratio/high_mean": 0.0009492197441431927, + "clip_ratio/low_mean": 0.0007205124074971536, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016697321334504522, + "epoch": 2.716243802857976, + "grad_norm": 0.202647864818573, + "learning_rate": 1e-06, + "loss": -0.0354, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.002372305709286593, + "clip_ratio/high_mean": 0.0009050377866515191, + "clip_ratio/low_mean": 0.0009203308145515621, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018253686212119646, + "epoch": 2.7185768445610963, + "grad_norm": 0.1613180786371231, + "learning_rate": 1e-06, + "loss": -0.0355, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.002923695297795348, + "clip_ratio/high_mean": 0.0010887382250075461, + "clip_ratio/low_mean": 0.0005822753610118525, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001671013578743441, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3938.0, + "completions/mean_length": 1305.7154541015625, + "completions/mean_terminated_length": 661.8035888671875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 2.720909886264217, + "grad_norm": 0.34382542967796326, + "learning_rate": 1e-06, + "loss": -0.0553, + "num_tokens": 171422433.0, + "reward": 0.5245535969734192, + "reward_std": 0.1658179610967636, + "rewards/verify_math_reward/mean": 0.5245535969734192, + "rewards/verify_math_reward/std": 0.4996756613254547, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.00350352305395063, + "clip_ratio/high_mean": 0.0011558006335690152, + "clip_ratio/low_mean": 0.00077688737110293, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019326879591972101, + "epoch": 2.7232429279673376, + "grad_norm": 0.29401642084121704, + "learning_rate": 1e-06, + "loss": -0.0555, + "step": 1166 + }, + { + "clip_ratio/high_max": 0.0030248135080910288, + "clip_ratio/high_mean": 0.0012315093026700197, + "clip_ratio/low_mean": 0.0010095744673890295, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002241083755507134, + "epoch": 2.725575969670458, + "grad_norm": 0.2268916368484497, + "learning_rate": 1e-06, + "loss": -0.0557, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.003271043417043984, + "clip_ratio/high_mean": 0.0011021570935554337, + "clip_ratio/low_mean": 0.0012381917695165612, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023403488739859313, + "epoch": 2.7279090113735784, + "grad_norm": 0.2651258111000061, + "learning_rate": 1e-06, + "loss": -0.0558, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0018323434414924122, + "clip_ratio/high_mean": 0.0007325293281610357, + "clip_ratio/low_mean": 0.0006358508017001441, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001368380144413095, + "completions/clipped_ratio": 0.1484375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3717.0, + "completions/mean_length": 1216.068115234375, + "completions/mean_terminated_length": 714.0616455078125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 2.7302420530766986, + "grad_norm": 0.2982531189918518, + "learning_rate": 1e-06, + "loss": -0.0591, + "num_tokens": 172058222.0, + "reward": 0.5111607313156128, + "reward_std": 0.18160229921340942, + "rewards/verify_math_reward/mean": 0.5111607313156128, + "rewards/verify_math_reward/std": 0.5001546144485474, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.0024237072066171095, + "clip_ratio/high_mean": 0.0009196715491270879, + "clip_ratio/low_mean": 0.0009615821636543842, + "clip_ratio/low_min": 6.521401428472018e-05, + "clip_ratio/region_mean": 0.001881253694591578, + "epoch": 2.7325750947798193, + "grad_norm": 0.2623019516468048, + "learning_rate": 1e-06, + "loss": -0.0595, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0023440176228177734, + "clip_ratio/high_mean": 0.0008444089853583137, + "clip_ratio/low_mean": 0.0010867597684409702, + "clip_ratio/low_min": 5.1829283620463684e-05, + "clip_ratio/region_mean": 0.001931168750161305, + "epoch": 2.7349081364829395, + "grad_norm": 0.26250067353248596, + "learning_rate": 1e-06, + "loss": -0.0595, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.002036090547335334, + "clip_ratio/high_mean": 0.0008630632728454657, + "clip_ratio/low_mean": 0.0012405992401909316, + "clip_ratio/low_min": 8.95601797310519e-05, + "clip_ratio/region_mean": 0.002103662511217408, + "epoch": 2.73724117818606, + "grad_norm": 0.21721115708351135, + "learning_rate": 1e-06, + "loss": -0.0596, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0025588344724383205, + "clip_ratio/high_mean": 0.0009905137867463054, + "clip_ratio/low_mean": 0.00043740399178204825, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014279178067226894, + "completions/clipped_ratio": 0.1607142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3473.0, + "completions/mean_length": 1183.5904541015625, + "completions/mean_terminated_length": 625.8948974609375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 2.7395742198891804, + "grad_norm": 0.3492778539657593, + "learning_rate": 1e-06, + "loss": -0.0756, + "num_tokens": 172629727.0, + "reward": 0.5535714626312256, + "reward_std": 0.14661546051502228, + "rewards/verify_math_reward/mean": 0.5535714030265808, + "rewards/verify_math_reward/std": 0.4973994791507721, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.003237783705117181, + "clip_ratio/high_mean": 0.001139278729169746, + "clip_ratio/low_mean": 0.0006850259969723993, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018243047306896187, + "epoch": 2.741907261592301, + "grad_norm": 0.33572831749916077, + "learning_rate": 1e-06, + "loss": -0.0758, + "step": 1174 + }, + { + "clip_ratio/high_max": 0.002890553376346361, + "clip_ratio/high_mean": 0.0011176306452398421, + "clip_ratio/low_mean": 0.0008286535630759317, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019462841810309328, + "epoch": 2.7442403032954212, + "grad_norm": 0.21871332824230194, + "learning_rate": 1e-06, + "loss": -0.076, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0031910955658531748, + "clip_ratio/high_mean": 0.001188287607874372, + "clip_ratio/low_mean": 0.0009394897433594451, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021277773412293755, + "epoch": 2.746573344998542, + "grad_norm": 0.2713559865951538, + "learning_rate": 1e-06, + "loss": -0.0762, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.002106075917254202, + "clip_ratio/high_mean": 0.000794560561189428, + "clip_ratio/low_mean": 0.0007055580717860721, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001500118654803373, + "completions/clipped_ratio": 0.1305803571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3589.0, + "completions/mean_length": 1118.3046875, + "completions/mean_terminated_length": 671.0770263671875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 2.7489063867016625, + "grad_norm": 0.356035053730011, + "learning_rate": 1e-06, + "loss": -0.0457, + "num_tokens": 173238416.0, + "reward": 0.6071428656578064, + "reward_std": 0.16281278431415558, + "rewards/verify_math_reward/mean": 0.6071428656578064, + "rewards/verify_math_reward/std": 0.48865827918052673, + "step": 1177 + }, + { + "clip_ratio/high_max": 0.0025467468294664286, + "clip_ratio/high_mean": 0.0009412623439857271, + "clip_ratio/low_mean": 0.0008732760998100275, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018145384601666592, + "epoch": 2.7512394284047827, + "grad_norm": 0.24822822213172913, + "learning_rate": 1e-06, + "loss": -0.046, + "step": 1178 + }, + { + "clip_ratio/high_max": 0.002678291952179279, + "clip_ratio/high_mean": 0.0009726418738864595, + "clip_ratio/low_mean": 0.0009707488916319562, + "clip_ratio/low_min": 2.565154863987118e-05, + "clip_ratio/region_mean": 0.001943390758242458, + "epoch": 2.753572470107903, + "grad_norm": 0.2327621877193451, + "learning_rate": 1e-06, + "loss": -0.0461, + "step": 1179 + }, + { + "clip_ratio/high_max": 0.0026214629324385896, + "clip_ratio/high_mean": 0.0008822716972645139, + "clip_ratio/low_mean": 0.0013206969961174764, + "clip_ratio/low_min": 2.565154863987118e-05, + "clip_ratio/region_mean": 0.002202968746132683, + "epoch": 2.7559055118110236, + "grad_norm": 0.23097383975982666, + "learning_rate": 1e-06, + "loss": -0.0462, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0026383646836620755, + "clip_ratio/high_mean": 0.000980668108240934, + "clip_ratio/low_mean": 0.0007353683377004927, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017160364659503102, + "completions/clipped_ratio": 0.1696428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4095.0, + "completions/mean_length": 1247.09716796875, + "completions/mean_terminated_length": 665.0631713867188, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 2.7582385535141443, + "grad_norm": 0.32525336742401123, + "learning_rate": 1e-06, + "loss": -0.0709, + "num_tokens": 173821887.0, + "reward": 0.5345982313156128, + "reward_std": 0.16930988430976868, + "rewards/verify_math_reward/mean": 0.5345982313156128, + "rewards/verify_math_reward/std": 0.4990801215171814, + "step": 1181 + }, + { + "clip_ratio/high_max": 0.002790924299915787, + "clip_ratio/high_mean": 0.0010920978220383404, + "clip_ratio/low_mean": 0.0009414568048669025, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020335546214482747, + "epoch": 2.7605715952172645, + "grad_norm": 0.25325608253479004, + "learning_rate": 1e-06, + "loss": -0.0711, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.002858604333596304, + "clip_ratio/high_mean": 0.0010866996581171406, + "clip_ratio/low_mean": 0.0011052198096876964, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021919194914516993, + "epoch": 2.7629046369203847, + "grad_norm": 0.22945712506771088, + "learning_rate": 1e-06, + "loss": -0.0713, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.002792059247440193, + "clip_ratio/high_mean": 0.0011309509209240787, + "clip_ratio/low_mean": 0.0013075094866508152, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002438460389385, + "epoch": 2.7652376786235053, + "grad_norm": 0.23172591626644135, + "learning_rate": 1e-06, + "loss": -0.0714, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.003013772060512565, + "clip_ratio/high_mean": 0.0011258381564402953, + "clip_ratio/low_mean": 0.0008641459953651065, + "clip_ratio/low_min": 1.304801662627142e-05, + "clip_ratio/region_mean": 0.0019899841499864124, + "completions/clipped_ratio": 0.1227678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3706.0, + "completions/mean_length": 1096.640625, + "completions/mean_terminated_length": 676.8829345703125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 2.767570720326626, + "grad_norm": 0.4517523944377899, + "learning_rate": 1e-06, + "loss": -0.0366, + "num_tokens": 174442205.0, + "reward": 0.5424107313156128, + "reward_std": 0.18754372000694275, + "rewards/verify_math_reward/mean": 0.5424107313156128, + "rewards/verify_math_reward/std": 0.4984763562679291, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.003333640306664165, + "clip_ratio/high_mean": 0.001336344128503697, + "clip_ratio/low_mean": 0.0012563255713757826, + "clip_ratio/low_min": 6.52400849503465e-05, + "clip_ratio/region_mean": 0.002592669588921126, + "epoch": 2.769903762029746, + "grad_norm": 0.3139788508415222, + "learning_rate": 1e-06, + "loss": -0.037, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.0034307411697227508, + "clip_ratio/high_mean": 0.0012566710538521875, + "clip_ratio/low_mean": 0.0014524489160976373, + "clip_ratio/low_min": 5.219206650508568e-05, + "clip_ratio/region_mean": 0.002709119929932058, + "epoch": 2.772236803732867, + "grad_norm": 0.26903483271598816, + "learning_rate": 1e-06, + "loss": -0.0371, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.0035063183604506776, + "clip_ratio/high_mean": 0.00135813991073519, + "clip_ratio/low_mean": 0.0017541509369038977, + "clip_ratio/low_min": 7.303107850020751e-05, + "clip_ratio/region_mean": 0.0031122907894314267, + "epoch": 2.774569845435987, + "grad_norm": 0.2814794182777405, + "learning_rate": 1e-06, + "loss": -0.0373, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0020838035343331285, + "clip_ratio/high_mean": 0.0008119172434817301, + "clip_ratio/low_mean": 0.0005080953824290191, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001320012645010138, + "completions/clipped_ratio": 0.1729910714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4013.0, + "completions/mean_length": 1269.9576416015625, + "completions/mean_terminated_length": 678.8150634765625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 2.7769028871391077, + "grad_norm": 0.2438543736934662, + "learning_rate": 1e-06, + "loss": -0.0851, + "num_tokens": 175037479.0, + "reward": 0.5178571939468384, + "reward_std": 0.1632988154888153, + "rewards/verify_math_reward/mean": 0.5178571343421936, + "rewards/verify_math_reward/std": 0.4999600946903229, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.002365332911722362, + "clip_ratio/high_mean": 0.0009482004315941595, + "clip_ratio/low_mean": 0.000811347272247076, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017595476601854898, + "epoch": 2.779235928842228, + "grad_norm": 0.21394813060760498, + "learning_rate": 1e-06, + "loss": -0.0853, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0028085903213650454, + "clip_ratio/high_mean": 0.001010134254102013, + "clip_ratio/low_mean": 0.0008846170476317639, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018947512508020736, + "epoch": 2.7815689705453486, + "grad_norm": 0.20263056457042694, + "learning_rate": 1e-06, + "loss": -0.0854, + "step": 1191 + }, + { + "clip_ratio/high_max": 0.00252992239620653, + "clip_ratio/high_mean": 0.0009914755883073667, + "clip_ratio/low_mean": 0.0010134298354387283, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020049054437549785, + "epoch": 2.783902012248469, + "grad_norm": 0.20704755187034607, + "learning_rate": 1e-06, + "loss": -0.0854, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.002115512397722341, + "clip_ratio/high_mean": 0.0009101857149289572, + "clip_ratio/low_mean": 0.0007816509087206214, + "clip_ratio/low_min": 1.2021542715956457e-05, + "clip_ratio/region_mean": 0.0016918366600293666, + "completions/clipped_ratio": 0.1629464285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3332.0, + "completions/mean_length": 1241.036865234375, + "completions/mean_terminated_length": 685.2706298828125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 2.7862350539515894, + "grad_norm": 0.339778333902359, + "learning_rate": 1e-06, + "loss": -0.0496, + "num_tokens": 175639264.0, + "reward": 0.5212053656578064, + "reward_std": 0.19377093017101288, + "rewards/verify_math_reward/mean": 0.5212053656578064, + "rewards/verify_math_reward/std": 0.49982914328575134, + "step": 1193 + }, + { + "clip_ratio/high_max": 0.002788091187539976, + "clip_ratio/high_mean": 0.0011203964841115521, + "clip_ratio/low_mean": 0.0010449898163642501, + "clip_ratio/low_min": 1.3845812645740807e-05, + "clip_ratio/region_mean": 0.0021653863150277175, + "epoch": 2.7885680956547096, + "grad_norm": 0.28704142570495605, + "learning_rate": 1e-06, + "loss": -0.0499, + "step": 1194 + }, + { + "clip_ratio/high_max": 0.003262251972046215, + "clip_ratio/high_mean": 0.0012426850935298717, + "clip_ratio/low_mean": 0.0013735083193751052, + "clip_ratio/low_min": 2.6974536012858152e-05, + "clip_ratio/region_mean": 0.002616193421999924, + "epoch": 2.7909011373578303, + "grad_norm": 0.24440589547157288, + "learning_rate": 1e-06, + "loss": -0.0501, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.002734982001129538, + "clip_ratio/high_mean": 0.0011236375848966418, + "clip_ratio/low_mean": 0.0015607085042574909, + "clip_ratio/low_min": 4.940060534863733e-05, + "clip_ratio/region_mean": 0.002684346094611101, + "epoch": 2.793234179060951, + "grad_norm": 0.2630174458026886, + "learning_rate": 1e-06, + "loss": -0.0502, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.003300103453511838, + "clip_ratio/high_mean": 0.001191360746815917, + "clip_ratio/low_mean": 0.0007367967900790973, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019281575951026753, + "completions/clipped_ratio": 0.1573660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3343.0, + "completions/mean_length": 1237.2467041015625, + "completions/mean_terminated_length": 703.3602905273438, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 2.795567220764071, + "grad_norm": 0.30969828367233276, + "learning_rate": 1e-06, + "loss": -0.0785, + "num_tokens": 176254261.0, + "reward": 0.5792410969734192, + "reward_std": 0.16995178163051605, + "rewards/verify_math_reward/mean": 0.5792410969734192, + "rewards/verify_math_reward/std": 0.49395665526390076, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.003714725600730162, + "clip_ratio/high_mean": 0.0013485967392625753, + "clip_ratio/low_mean": 0.0009606522280591889, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002309248950041365, + "epoch": 2.7979002624671914, + "grad_norm": 0.29674312472343445, + "learning_rate": 1e-06, + "loss": -0.0786, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.0038086366112111136, + "clip_ratio/high_mean": 0.0014416351423278684, + "clip_ratio/low_mean": 0.0009979936794479727, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00243962890090188, + "epoch": 2.800233304170312, + "grad_norm": 0.3184277415275574, + "learning_rate": 1e-06, + "loss": -0.0788, + "step": 1199 + }, + { + "clip_ratio/high_max": 0.003577950566977961, + "clip_ratio/high_mean": 0.0012790588716597995, + "clip_ratio/low_mean": 0.001276157284337387, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0025552161896484904, + "epoch": 2.8025663458734327, + "grad_norm": 0.22036480903625488, + "learning_rate": 1e-06, + "loss": -0.079, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0021206787823757622, + "clip_ratio/high_mean": 0.0008117430406855419, + "clip_ratio/low_mean": 0.0005327217286321684, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013444648102449719, + "completions/clipped_ratio": 0.1529017857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3625.0, + "completions/mean_length": 1157.3828125, + "completions/mean_terminated_length": 626.9605102539062, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "epoch": 2.804899387576553, + "grad_norm": 0.26239198446273804, + "learning_rate": 1e-06, + "loss": -0.0505, + "num_tokens": 176810772.0, + "reward": 0.5580357313156128, + "reward_std": 0.14676883816719055, + "rewards/verify_math_reward/mean": 0.5580357313156128, + "rewards/verify_math_reward/std": 0.49689778685569763, + "step": 1201 + }, + { + "clip_ratio/high_max": 0.0025539233974996023, + "clip_ratio/high_mean": 0.0009800572079257108, + "clip_ratio/low_mean": 0.0007396926212095423, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017197498382302001, + "epoch": 2.8072324292796735, + "grad_norm": 0.19713591039180756, + "learning_rate": 1e-06, + "loss": -0.0507, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.002599708161142189, + "clip_ratio/high_mean": 0.0009262524527002824, + "clip_ratio/low_mean": 0.0008012518883333541, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017275043101108167, + "epoch": 2.8095654709827937, + "grad_norm": 0.24003306031227112, + "learning_rate": 1e-06, + "loss": -0.0507, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.002897015765483957, + "clip_ratio/high_mean": 0.0010553792744758539, + "clip_ratio/low_mean": 0.000998868641545414, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002054247903288342, + "epoch": 2.8118985126859144, + "grad_norm": 0.20766811072826385, + "learning_rate": 1e-06, + "loss": -0.0508, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0023108789464458823, + "clip_ratio/high_mean": 0.0009054496049429872, + "clip_ratio/low_mean": 0.0006721101754010306, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015775597821630072, + "completions/clipped_ratio": 0.1852678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2730.0, + "completions/mean_length": 1284.1239013671875, + "completions/mean_terminated_length": 644.7109985351562, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 2.8142315543890346, + "grad_norm": 0.33877506852149963, + "learning_rate": 1e-06, + "loss": -0.0541, + "num_tokens": 177365099.0, + "reward": 0.5245535969734192, + "reward_std": 0.17570818960666656, + "rewards/verify_math_reward/mean": 0.5245535969734192, + "rewards/verify_math_reward/std": 0.4996756613254547, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.002984140723128803, + "clip_ratio/high_mean": 0.0012275164845050313, + "clip_ratio/low_mean": 0.0009287758030041005, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021562922847806476, + "epoch": 2.8165645960921553, + "grad_norm": 0.3308020532131195, + "learning_rate": 1e-06, + "loss": -0.0544, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.003067373127123574, + "clip_ratio/high_mean": 0.0012576047138281865, + "clip_ratio/low_mean": 0.0011606596617639298, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024182643537642434, + "epoch": 2.8188976377952755, + "grad_norm": 0.2738145589828491, + "learning_rate": 1e-06, + "loss": -0.0547, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.0028996927321713883, + "clip_ratio/high_mean": 0.0010932534914900316, + "clip_ratio/low_mean": 0.0014001720101077808, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024934255197877064, + "epoch": 2.821230679498396, + "grad_norm": 0.35904988646507263, + "learning_rate": 1e-06, + "loss": -0.0547, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.00207980446430156, + "clip_ratio/high_mean": 0.0007250223825394642, + "clip_ratio/low_mean": 0.0006736360783179407, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013986584708618466, + "completions/clipped_ratio": 0.1729910714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4080.0, + "completions/mean_length": 1330.8460693359375, + "completions/mean_terminated_length": 752.43994140625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 2.8235637212015163, + "grad_norm": 0.2817864418029785, + "learning_rate": 1e-06, + "loss": -0.0639, + "num_tokens": 178012521.0, + "reward": 0.53125, + "reward_std": 0.16198793053627014, + "rewards/verify_math_reward/mean": 0.53125, + "rewards/verify_math_reward/std": 0.4993011951446533, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.0026954747299896553, + "clip_ratio/high_mean": 0.0008574896291975165, + "clip_ratio/low_mean": 0.0009417580295121297, + "clip_ratio/low_min": 1.9860184693243355e-05, + "clip_ratio/region_mean": 0.001799247671442572, + "epoch": 2.825896762904637, + "grad_norm": 0.2551283538341522, + "learning_rate": 1e-06, + "loss": -0.0641, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0026105109573109075, + "clip_ratio/high_mean": 0.0008668896643939661, + "clip_ratio/low_mean": 0.0011566333250812022, + "clip_ratio/low_min": 2.131287328666076e-05, + "clip_ratio/region_mean": 0.002023522974923253, + "epoch": 2.8282298046077576, + "grad_norm": 0.22527474164962769, + "learning_rate": 1e-06, + "loss": -0.0642, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.002564192283898592, + "clip_ratio/high_mean": 0.0008459131749987137, + "clip_ratio/low_mean": 0.0012112642543797847, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002057177436654456, + "epoch": 2.830562846310878, + "grad_norm": 0.22328247129917145, + "learning_rate": 1e-06, + "loss": -0.0643, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.002531326266762335, + "clip_ratio/high_mean": 0.0010001528316934127, + "clip_ratio/low_mean": 0.0007164640178416448, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017166168399853632, + "completions/clipped_ratio": 0.1473214285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2916.0, + "completions/mean_length": 1123.37060546875, + "completions/mean_terminated_length": 609.77490234375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 2.832895888013998, + "grad_norm": 0.3418889343738556, + "learning_rate": 1e-06, + "loss": -0.0552, + "num_tokens": 178569413.0, + "reward": 0.6004464626312256, + "reward_std": 0.17634011805057526, + "rewards/verify_math_reward/mean": 0.6004464030265808, + "rewards/verify_math_reward/std": 0.49008017778396606, + "step": 1213 + }, + { + "clip_ratio/high_max": 0.002907997964939568, + "clip_ratio/high_mean": 0.0012003184456261806, + "clip_ratio/low_mean": 0.000967797907833301, + "clip_ratio/low_min": 1.5166221601248253e-05, + "clip_ratio/region_mean": 0.0021681163270841353, + "epoch": 2.8352289297171187, + "grad_norm": 0.2897513210773468, + "learning_rate": 1e-06, + "loss": -0.0554, + "step": 1214 + }, + { + "clip_ratio/high_max": 0.002957124510430731, + "clip_ratio/high_mean": 0.0010831455474544782, + "clip_ratio/low_mean": 0.0012507559713412775, + "clip_ratio/low_min": 1.1959433322772384e-05, + "clip_ratio/region_mean": 0.002333901538804639, + "epoch": 2.8375619714202394, + "grad_norm": 0.2763117849826813, + "learning_rate": 1e-06, + "loss": -0.0556, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0029035662228125148, + "clip_ratio/high_mean": 0.001143075820436934, + "clip_ratio/low_mean": 0.001538982873171335, + "clip_ratio/low_min": 3.0332443202496506e-05, + "clip_ratio/region_mean": 0.0026820587008842267, + "epoch": 2.8398950131233596, + "grad_norm": 0.25847601890563965, + "learning_rate": 1e-06, + "loss": -0.0557, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.002246396463306155, + "clip_ratio/high_mean": 0.0008512069762218744, + "clip_ratio/low_mean": 0.0006472689628935768, + "clip_ratio/low_min": 1.4785900020797271e-05, + "clip_ratio/region_mean": 0.0014984759218350518, + "completions/clipped_ratio": 0.1696428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4058.0, + "completions/mean_length": 1240.4609375, + "completions/mean_terminated_length": 657.0712280273438, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 2.8422280548264798, + "grad_norm": 0.3724900484085083, + "learning_rate": 1e-06, + "loss": -0.0451, + "num_tokens": 179157538.0, + "reward": 0.559151828289032, + "reward_std": 0.15642336010932922, + "rewards/verify_math_reward/mean": 0.5591517686843872, + "rewards/verify_math_reward/std": 0.496766060590744, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.002594152738311095, + "clip_ratio/high_mean": 0.0010427189536130754, + "clip_ratio/low_mean": 0.0009653071083448594, + "clip_ratio/low_min": 1.4785900020797271e-05, + "clip_ratio/region_mean": 0.0020080260692338925, + "epoch": 2.8445610965296004, + "grad_norm": 0.27912214398384094, + "learning_rate": 1e-06, + "loss": -0.0455, + "step": 1218 + }, + { + "clip_ratio/high_max": 0.002910590839746874, + "clip_ratio/high_mean": 0.001090924324671505, + "clip_ratio/low_mean": 0.0010437631863169372, + "clip_ratio/low_min": 2.9571800041594543e-05, + "clip_ratio/region_mean": 0.0021346874891605694, + "epoch": 2.846894138232721, + "grad_norm": 0.25943613052368164, + "learning_rate": 1e-06, + "loss": -0.0456, + "step": 1219 + }, + { + "clip_ratio/high_max": 0.00258619509259006, + "clip_ratio/high_mean": 0.0009054034726432292, + "clip_ratio/low_mean": 0.0012875327192887198, + "clip_ratio/low_min": 4.4357700971886516e-05, + "clip_ratio/region_mean": 0.0021929361901129596, + "epoch": 2.8492271799358413, + "grad_norm": 0.23242810368537903, + "learning_rate": 1e-06, + "loss": -0.0457, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.002463534357957542, + "clip_ratio/high_mean": 0.0009862915576377418, + "clip_ratio/low_mean": 0.000740970635888516, + "clip_ratio/low_min": 3.55821248376742e-05, + "clip_ratio/region_mean": 0.0017272621698793955, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3266.0, + "completions/mean_length": 1230.34716796875, + "completions/mean_terminated_length": 635.5889282226562, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 2.851560221638962, + "grad_norm": 0.33978813886642456, + "learning_rate": 1e-06, + "loss": -0.0675, + "num_tokens": 179712281.0, + "reward": 0.5446428656578064, + "reward_std": 0.1726941168308258, + "rewards/verify_math_reward/mean": 0.5446428656578064, + "rewards/verify_math_reward/std": 0.49828118085861206, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.0031510016779066063, + "clip_ratio/high_mean": 0.0011794015936175128, + "clip_ratio/low_mean": 0.0008715644653420895, + "clip_ratio/low_min": 4.572310263029067e-05, + "clip_ratio/region_mean": 0.002050966075330507, + "epoch": 2.853893263342082, + "grad_norm": 0.3055119216442108, + "learning_rate": 1e-06, + "loss": -0.0677, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.0033840225733001716, + "clip_ratio/high_mean": 0.0012274939244889538, + "clip_ratio/low_mean": 0.0011448321802163264, + "clip_ratio/low_min": 1.3261192179925274e-05, + "clip_ratio/region_mean": 0.0023723261037957855, + "epoch": 2.856226305045203, + "grad_norm": 0.2623078227043152, + "learning_rate": 1e-06, + "loss": -0.068, + "step": 1223 + }, + { + "clip_ratio/high_max": 0.0027565543532546144, + "clip_ratio/high_mean": 0.0010786218917928636, + "clip_ratio/low_mean": 0.0014120109372015577, + "clip_ratio/low_min": 9.161041270999704e-05, + "clip_ratio/region_mean": 0.0024906328108045273, + "epoch": 2.858559346748323, + "grad_norm": 0.2556232213973999, + "learning_rate": 1e-06, + "loss": -0.068, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.002132123066985514, + "clip_ratio/high_mean": 0.000865826683366322, + "clip_ratio/low_mean": 0.0004899367630741835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013557634192693513, + "completions/clipped_ratio": 0.1841517857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3233.0, + "completions/mean_length": 1271.3717041015625, + "completions/mean_terminated_length": 633.8016967773438, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 2.8608923884514437, + "grad_norm": 0.3052181601524353, + "learning_rate": 1e-06, + "loss": -0.0542, + "num_tokens": 180259150.0, + "reward": 0.5345982313156128, + "reward_std": 0.1515420526266098, + "rewards/verify_math_reward/mean": 0.5345982313156128, + "rewards/verify_math_reward/std": 0.4990801215171814, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.00272468279581517, + "clip_ratio/high_mean": 0.0010511810796742793, + "clip_ratio/low_mean": 0.0006787887932659942, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017299698738497682, + "epoch": 2.863225430154564, + "grad_norm": 0.3244656026363373, + "learning_rate": 1e-06, + "loss": -0.0545, + "step": 1226 + }, + { + "clip_ratio/high_max": 0.0026379339178674854, + "clip_ratio/high_mean": 0.001092517632059753, + "clip_ratio/low_mean": 0.0008542970224425517, + "clip_ratio/low_min": 2.0647505152737722e-05, + "clip_ratio/region_mean": 0.0019468145837890916, + "epoch": 2.8655584718576845, + "grad_norm": 0.2317638099193573, + "learning_rate": 1e-06, + "loss": -0.0546, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.002808631288644392, + "clip_ratio/high_mean": 0.0010549850030656671, + "clip_ratio/low_mean": 0.0010281999293511035, + "clip_ratio/low_min": 1.575497844896745e-05, + "clip_ratio/region_mean": 0.002083184925140813, + "epoch": 2.8678915135608047, + "grad_norm": 0.2297726422548294, + "learning_rate": 1e-06, + "loss": -0.0546, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0020344149161246605, + "clip_ratio/high_mean": 0.0008280365382233867, + "clip_ratio/low_mean": 0.0006267763910727808, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001454812940210104, + "completions/clipped_ratio": 0.1339285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2822.0, + "completions/mean_length": 1098.884033203125, + "completions/mean_terminated_length": 635.412353515625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 2.8702245552639254, + "grad_norm": 0.37846052646636963, + "learning_rate": 1e-06, + "loss": -0.0355, + "num_tokens": 180839038.0, + "reward": 0.6049107313156128, + "reward_std": 0.1555236279964447, + "rewards/verify_math_reward/mean": 0.6049107313156128, + "rewards/verify_math_reward/std": 0.48914292454719543, + "step": 1229 + }, + { + "clip_ratio/high_max": 0.002511028382286895, + "clip_ratio/high_mean": 0.0010764880844362779, + "clip_ratio/low_mean": 0.0007981017679412616, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018745899287750944, + "epoch": 2.872557596967046, + "grad_norm": 0.2716333568096161, + "learning_rate": 1e-06, + "loss": -0.0358, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0027851527556777, + "clip_ratio/high_mean": 0.0010962301967083476, + "clip_ratio/low_mean": 0.0010780406737467274, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00217427085590316, + "epoch": 2.8748906386701663, + "grad_norm": 0.24592313170433044, + "learning_rate": 1e-06, + "loss": -0.0361, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.0025209656268998515, + "clip_ratio/high_mean": 0.000988474640507775, + "clip_ratio/low_mean": 0.0013377151772147045, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023261897658812813, + "epoch": 2.8772236803732865, + "grad_norm": 0.2778870463371277, + "learning_rate": 1e-06, + "loss": -0.0361, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0020293524794396944, + "clip_ratio/high_mean": 0.0008347573566425126, + "clip_ratio/low_mean": 0.0005649715913023101, + "clip_ratio/low_min": 1.323311425949214e-05, + "clip_ratio/region_mean": 0.0013997289497638121, + "completions/clipped_ratio": 0.1707589285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3990.0, + "completions/mean_length": 1249.10498046875, + "completions/mean_terminated_length": 662.8667602539062, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 2.879556722076407, + "grad_norm": 0.2888152599334717, + "learning_rate": 1e-06, + "loss": -0.0342, + "num_tokens": 181418684.0, + "reward": 0.520089328289032, + "reward_std": 0.1504133939743042, + "rewards/verify_math_reward/mean": 0.5200892686843872, + "rewards/verify_math_reward/std": 0.4998753070831299, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.0025165256338368636, + "clip_ratio/high_mean": 0.0009329646563855931, + "clip_ratio/low_mean": 0.0008046613766055088, + "clip_ratio/low_min": 1.8296252164873295e-05, + "clip_ratio/region_mean": 0.0017376260075252503, + "epoch": 2.8818897637795278, + "grad_norm": 0.22874966263771057, + "learning_rate": 1e-06, + "loss": -0.0344, + "step": 1234 + }, + { + "clip_ratio/high_max": 0.00254748542465677, + "clip_ratio/high_mean": 0.0010051646777355927, + "clip_ratio/low_mean": 0.0009650562169554178, + "clip_ratio/low_min": 1.3403388038568664e-05, + "clip_ratio/region_mean": 0.001970220921066357, + "epoch": 2.884222805482648, + "grad_norm": 0.21959078311920166, + "learning_rate": 1e-06, + "loss": -0.0345, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0026146117743337527, + "clip_ratio/high_mean": 0.000941455909924116, + "clip_ratio/low_mean": 0.0010380630519648548, + "clip_ratio/low_min": 5.293245703796856e-05, + "clip_ratio/region_mean": 0.00197951905647642, + "epoch": 2.886555847185768, + "grad_norm": 0.28595098853111267, + "learning_rate": 1e-06, + "loss": -0.0346, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.002767169673461467, + "clip_ratio/high_mean": 0.0011425521242927061, + "clip_ratio/low_mean": 0.0009676702266006032, + "clip_ratio/low_min": 4.705656374426326e-05, + "clip_ratio/region_mean": 0.0021102223327034153, + "completions/clipped_ratio": 0.1662946428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2665.0, + "completions/mean_length": 1280.4320068359375, + "completions/mean_terminated_length": 718.8259887695312, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 2.888888888888889, + "grad_norm": 0.3602418303489685, + "learning_rate": 1e-06, + "loss": -0.0748, + "num_tokens": 182040983.0, + "reward": 0.4988839626312256, + "reward_std": 0.20282992720603943, + "rewards/verify_math_reward/mean": 0.4988839328289032, + "rewards/verify_math_reward/std": 0.5002779960632324, + "step": 1237 + }, + { + "clip_ratio/high_max": 0.003788474656175822, + "clip_ratio/high_mean": 0.0013509797827282455, + "clip_ratio/low_mean": 0.0012202237157907803, + "clip_ratio/low_min": 2.8419009140634444e-05, + "clip_ratio/region_mean": 0.0025712034985190257, + "epoch": 2.8912219305920095, + "grad_norm": 0.2973235547542572, + "learning_rate": 1e-06, + "loss": -0.0751, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.0034605427645146847, + "clip_ratio/high_mean": 0.0013756530061073136, + "clip_ratio/low_mean": 0.0014561256757588126, + "clip_ratio/low_min": 9.341506392956944e-05, + "clip_ratio/region_mean": 0.002831778627296444, + "epoch": 2.8935549722951297, + "grad_norm": 0.2692912220954895, + "learning_rate": 1e-06, + "loss": -0.0753, + "step": 1239 + }, + { + "clip_ratio/high_max": 0.0036129713262198493, + "clip_ratio/high_mean": 0.0013319130812305957, + "clip_ratio/low_mean": 0.001683001533820061, + "clip_ratio/low_min": 7.718307097093202e-05, + "clip_ratio/region_mean": 0.0030149146332405508, + "epoch": 2.8958880139982504, + "grad_norm": 0.27325916290283203, + "learning_rate": 1e-06, + "loss": -0.0754, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0032859176135389134, + "clip_ratio/high_mean": 0.0013003923741052859, + "clip_ratio/low_mean": 0.0005826023161716876, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018829947512131184, + "completions/clipped_ratio": 0.1584821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3380.0, + "completions/mean_length": 1168.388427734375, + "completions/mean_terminated_length": 617.0344848632812, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 2.8982210557013706, + "grad_norm": 0.4320470690727234, + "learning_rate": 1e-06, + "loss": -0.0908, + "num_tokens": 182605411.0, + "reward": 0.6127232313156128, + "reward_std": 0.17979852855205536, + "rewards/verify_math_reward/mean": 0.6127232313156128, + "rewards/verify_math_reward/std": 0.4873998463153839, + "step": 1241 + }, + { + "clip_ratio/high_max": 0.003133914178761188, + "clip_ratio/high_mean": 0.0012624690498341806, + "clip_ratio/low_mean": 0.0008868542624895781, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021493233216460794, + "epoch": 2.900554097404491, + "grad_norm": 0.33980104327201843, + "learning_rate": 1e-06, + "loss": -0.091, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.003558458687621169, + "clip_ratio/high_mean": 0.0014487796688626986, + "clip_ratio/low_mean": 0.0011010279954462021, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002549807686591521, + "epoch": 2.9028871391076114, + "grad_norm": 0.2559306025505066, + "learning_rate": 1e-06, + "loss": -0.0913, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.0033493489027023315, + "clip_ratio/high_mean": 0.0013537378908949904, + "clip_ratio/low_mean": 0.0012544961227831664, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002608234019135125, + "epoch": 2.905220180810732, + "grad_norm": 0.26972684264183044, + "learning_rate": 1e-06, + "loss": -0.0913, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0020876192866126075, + "clip_ratio/high_mean": 0.0006878449212308624, + "clip_ratio/low_mean": 0.0005604099387710448, + "clip_ratio/low_min": 1.269551103177946e-05, + "clip_ratio/region_mean": 0.0012482548845582642, + "completions/clipped_ratio": 0.1808035714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3991.0, + "completions/mean_length": 1270.688720703125, + "completions/mean_terminated_length": 647.1185302734375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 2.9075532225138523, + "grad_norm": 0.23925110697746277, + "learning_rate": 1e-06, + "loss": -0.0772, + "num_tokens": 183170740.0, + "reward": 0.5680803656578064, + "reward_std": 0.14078985154628754, + "rewards/verify_math_reward/mean": 0.5680803656578064, + "rewards/verify_math_reward/std": 0.4956200420856476, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.002451334377838066, + "clip_ratio/high_mean": 0.0008019774531931034, + "clip_ratio/low_mean": 0.0007614677442688844, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001563445184729062, + "epoch": 2.909886264216973, + "grad_norm": 0.24351325631141663, + "learning_rate": 1e-06, + "loss": -0.0773, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.002328195438167313, + "clip_ratio/high_mean": 0.0008338033185282256, + "clip_ratio/low_mean": 0.0008368225990125211, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001670625941187609, + "epoch": 2.912219305920093, + "grad_norm": 0.23359844088554382, + "learning_rate": 1e-06, + "loss": -0.0774, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.0023826419565011747, + "clip_ratio/high_mean": 0.0008263971176347695, + "clip_ratio/low_mean": 0.0010366100559622282, + "clip_ratio/low_min": 2.891510484914761e-05, + "clip_ratio/region_mean": 0.0018630072263476904, + "epoch": 2.914552347623214, + "grad_norm": 0.1954878568649292, + "learning_rate": 1e-06, + "loss": -0.0775, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.002909617527620867, + "clip_ratio/high_mean": 0.001141853274020832, + "clip_ratio/low_mean": 0.0006739021519024391, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018157554331992287, + "completions/clipped_ratio": 0.1997767857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3484.0, + "completions/mean_length": 1377.587158203125, + "completions/mean_terminated_length": 698.931640625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 2.9168853893263345, + "grad_norm": 0.3355540931224823, + "learning_rate": 1e-06, + "loss": -0.1229, + "num_tokens": 183765530.0, + "reward": 0.5133928656578064, + "reward_std": 0.2020464688539505, + "rewards/verify_math_reward/mean": 0.5133928656578064, + "rewards/verify_math_reward/std": 0.500099778175354, + "step": 1249 + }, + { + "clip_ratio/high_max": 0.003443351000896655, + "clip_ratio/high_mean": 0.0013787734824290965, + "clip_ratio/low_mean": 0.0009193578698614147, + "clip_ratio/low_min": 1.67358411999885e-05, + "clip_ratio/region_mean": 0.002298131315910723, + "epoch": 2.9192184310294547, + "grad_norm": 0.3063414394855499, + "learning_rate": 1e-06, + "loss": -0.1232, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.003658572015410755, + "clip_ratio/high_mean": 0.0014169354108162224, + "clip_ratio/low_mean": 0.0011746121272153687, + "clip_ratio/low_min": 8.36792059999425e-06, + "clip_ratio/region_mean": 0.0025915475343936123, + "epoch": 2.921551472732575, + "grad_norm": 0.24443690478801727, + "learning_rate": 1e-06, + "loss": -0.1235, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.0033247855681111105, + "clip_ratio/high_mean": 0.0012643429545278195, + "clip_ratio/low_mean": 0.0012776335424860008, + "clip_ratio/low_min": 1.67358411999885e-05, + "clip_ratio/region_mean": 0.002541976544307545, + "epoch": 2.9238845144356955, + "grad_norm": 0.2677989602088928, + "learning_rate": 1e-06, + "loss": -0.1235, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0031514803486061282, + "clip_ratio/high_mean": 0.0010818075570568908, + "clip_ratio/low_mean": 0.0005988120160509425, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016806195926619694, + "completions/clipped_ratio": 0.2265625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3857.0, + "completions/mean_length": 1470.782470703125, + "completions/mean_terminated_length": 701.7792358398438, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 2.926217556138816, + "grad_norm": 0.36490005254745483, + "learning_rate": 1e-06, + "loss": -0.0912, + "num_tokens": 184341335.0, + "reward": 0.5033482313156128, + "reward_std": 0.17340727150440216, + "rewards/verify_math_reward/mean": 0.5033482313156128, + "rewards/verify_math_reward/std": 0.5002680420875549, + "step": 1253 + }, + { + "clip_ratio/high_max": 0.003511617695039604, + "clip_ratio/high_mean": 0.0012043408587487647, + "clip_ratio/low_mean": 0.0009400233684573323, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00214436421811115, + "epoch": 2.9285505978419364, + "grad_norm": 0.3625325560569763, + "learning_rate": 1e-06, + "loss": -0.0915, + "step": 1254 + }, + { + "clip_ratio/high_max": 0.0036964921237085946, + "clip_ratio/high_mean": 0.0013299772872414906, + "clip_ratio/low_mean": 0.0011411833074816968, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024711605874472298, + "epoch": 2.9308836395450566, + "grad_norm": 0.2712169885635376, + "learning_rate": 1e-06, + "loss": -0.0917, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0035515071940608323, + "clip_ratio/high_mean": 0.0011886032934853574, + "clip_ratio/low_mean": 0.001462831896787975, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026514351993682794, + "epoch": 2.9332166812481772, + "grad_norm": 0.28189143538475037, + "learning_rate": 1e-06, + "loss": -0.0918, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0029168042092351243, + "clip_ratio/high_mean": 0.0011313008290017024, + "clip_ratio/low_mean": 0.0006421525376936188, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017734533903421834, + "completions/clipped_ratio": 0.1729910714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3267.0, + "completions/mean_length": 1248.1785888671875, + "completions/mean_terminated_length": 652.4804077148438, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 2.935549722951298, + "grad_norm": 0.32276833057403564, + "learning_rate": 1e-06, + "loss": -0.0691, + "num_tokens": 184919535.0, + "reward": 0.5446428656578064, + "reward_std": 0.17130404710769653, + "rewards/verify_math_reward/mean": 0.5446428656578064, + "rewards/verify_math_reward/std": 0.4982811510562897, + "step": 1257 + }, + { + "clip_ratio/high_max": 0.003362276627740357, + "clip_ratio/high_mean": 0.0013551082702178974, + "clip_ratio/low_mean": 0.0008055710422922857, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021606793234241195, + "epoch": 2.937882764654418, + "grad_norm": 0.2880367338657379, + "learning_rate": 1e-06, + "loss": -0.0692, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.0032669836000422947, + "clip_ratio/high_mean": 0.0013611893409688491, + "clip_ratio/low_mean": 0.0010897846113948617, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024509740105713718, + "epoch": 2.9402158063575388, + "grad_norm": 0.2908535301685333, + "learning_rate": 1e-06, + "loss": -0.0694, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.003219017293304205, + "clip_ratio/high_mean": 0.0012895310610474553, + "clip_ratio/low_mean": 0.0012497504794737324, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002539281500503421, + "epoch": 2.942548848060659, + "grad_norm": 0.25888124108314514, + "learning_rate": 1e-06, + "loss": -0.0695, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0031690335308667272, + "clip_ratio/high_mean": 0.0010736659132817294, + "clip_ratio/low_mean": 0.000851191491165082, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019248574171797372, + "completions/clipped_ratio": 0.1316964285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4078.0, + "completions/mean_length": 1107.841552734375, + "completions/mean_terminated_length": 654.6246948242188, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 2.9448818897637796, + "grad_norm": 0.41687285900115967, + "learning_rate": 1e-06, + "loss": -0.0384, + "num_tokens": 185523905.0, + "reward": 0.5714285969734192, + "reward_std": 0.17070811986923218, + "rewards/verify_math_reward/mean": 0.5714285969734192, + "rewards/verify_math_reward/std": 0.49514803290367126, + "step": 1261 + }, + { + "clip_ratio/high_max": 0.003776175442908425, + "clip_ratio/high_mean": 0.0013129156450304436, + "clip_ratio/low_mean": 0.0011358087704138597, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024487245027557947, + "epoch": 2.9472149314669, + "grad_norm": 0.31890761852264404, + "learning_rate": 1e-06, + "loss": -0.0386, + "step": 1262 + }, + { + "clip_ratio/high_max": 0.0034182021554443054, + "clip_ratio/high_mean": 0.0012003357005596627, + "clip_ratio/low_mean": 0.0012306724784139078, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024310082371812314, + "epoch": 2.9495479731700205, + "grad_norm": 0.31810462474823, + "learning_rate": 1e-06, + "loss": -0.0388, + "step": 1263 + }, + { + "clip_ratio/high_max": 0.0037536010349867865, + "clip_ratio/high_mean": 0.0012760615500155836, + "clip_ratio/low_mean": 0.0015386828272312414, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0028147443663328886, + "epoch": 2.9518810148731407, + "grad_norm": 0.330525279045105, + "learning_rate": 1e-06, + "loss": -0.0389, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.003201582541805692, + "clip_ratio/high_mean": 0.0011561989849724341, + "clip_ratio/low_mean": 0.0007509991464758059, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019071980932494625, + "completions/clipped_ratio": 0.2131696428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3647.0, + "completions/mean_length": 1442.813720703125, + "completions/mean_terminated_length": 724.007080078125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 2.9542140565762613, + "grad_norm": 0.3820577561855316, + "learning_rate": 1e-06, + "loss": -0.1146, + "num_tokens": 186121642.0, + "reward": 0.520089328289032, + "reward_std": 0.19857734441757202, + "rewards/verify_math_reward/mean": 0.5200892686843872, + "rewards/verify_math_reward/std": 0.4998753070831299, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0037811147049069405, + "clip_ratio/high_mean": 0.001389671135257231, + "clip_ratio/low_mean": 0.0010202501507592387, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002409921267826576, + "epoch": 2.9565470982793816, + "grad_norm": 0.27282702922821045, + "learning_rate": 1e-06, + "loss": -0.1149, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.0034939453471451998, + "clip_ratio/high_mean": 0.0013106262995279394, + "clip_ratio/low_mean": 0.001251272336958209, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002561898661952, + "epoch": 2.958880139982502, + "grad_norm": 0.27828308939933777, + "learning_rate": 1e-06, + "loss": -0.1151, + "step": 1267 + }, + { + "clip_ratio/high_max": 0.0037431169039336964, + "clip_ratio/high_mean": 0.0013020098085689824, + "clip_ratio/low_mean": 0.001392197300447151, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0026942070908262394, + "epoch": 2.961213181685623, + "grad_norm": 0.3036958575248718, + "learning_rate": 1e-06, + "loss": -0.1152, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0024892619549063966, + "clip_ratio/high_mean": 0.0008576363125030184, + "clip_ratio/low_mean": 0.0006019526936142938, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001459589009755291, + "completions/clipped_ratio": 0.1674107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2883.0, + "completions/mean_length": 1226.3046875, + "completions/mean_terminated_length": 649.2882080078125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 2.963546223388743, + "grad_norm": 0.30294013023376465, + "learning_rate": 1e-06, + "loss": -0.088, + "num_tokens": 186695163.0, + "reward": 0.5636160969734192, + "reward_std": 0.1510867178440094, + "rewards/verify_math_reward/mean": 0.5636160969734192, + "rewards/verify_math_reward/std": 0.49621346592903137, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.0030007847453816794, + "clip_ratio/high_mean": 0.0010451485723024234, + "clip_ratio/low_mean": 0.0007863998307584552, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018315484121558256, + "epoch": 2.9658792650918633, + "grad_norm": 0.2306789755821228, + "learning_rate": 1e-06, + "loss": -0.0882, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.002864803740521893, + "clip_ratio/high_mean": 0.0010004250652855262, + "clip_ratio/low_mean": 0.0008507482689310564, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018511733578634448, + "epoch": 2.968212306794984, + "grad_norm": 0.23171015083789825, + "learning_rate": 1e-06, + "loss": -0.0883, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.0022417276850319467, + "clip_ratio/high_mean": 0.0008938817300077062, + "clip_ratio/low_mean": 0.0010590481469989754, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019529298806446604, + "epoch": 2.9705453484981046, + "grad_norm": 0.21633273363113403, + "learning_rate": 1e-06, + "loss": -0.0883, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.002702313297049841, + "clip_ratio/high_mean": 0.000888308519279235, + "clip_ratio/low_mean": 0.0006352464279189007, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015235549326462205, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4057.0, + "completions/mean_length": 1356.8382568359375, + "completions/mean_terminated_length": 658.6204833984375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 2.972878390201225, + "grad_norm": 0.3829348683357239, + "learning_rate": 1e-06, + "loss": -0.0772, + "num_tokens": 187255202.0, + "reward": 0.5223214626312256, + "reward_std": 0.15774601697921753, + "rewards/verify_math_reward/mean": 0.5223214030265808, + "rewards/verify_math_reward/std": 0.49978047609329224, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.003664480900624767, + "clip_ratio/high_mean": 0.001145835303759668, + "clip_ratio/low_mean": 0.0008248937992902938, + "clip_ratio/low_min": 1.3519359526981134e-05, + "clip_ratio/region_mean": 0.001970729099411983, + "epoch": 2.9752114319043454, + "grad_norm": 0.2908918261528015, + "learning_rate": 1e-06, + "loss": -0.0773, + "step": 1274 + }, + { + "clip_ratio/high_max": 0.0035258611969766207, + "clip_ratio/high_mean": 0.0011402282234485028, + "clip_ratio/low_mean": 0.0010692770429159282, + "clip_ratio/low_min": 1.970987068489194e-05, + "clip_ratio/region_mean": 0.0022095053136581555, + "epoch": 2.9775444736074657, + "grad_norm": 0.2749641239643097, + "learning_rate": 1e-06, + "loss": -0.0776, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0031806512270122766, + "clip_ratio/high_mean": 0.001110293977035326, + "clip_ratio/low_mean": 0.0012063731919624843, + "clip_ratio/low_min": 1.970987068489194e-05, + "clip_ratio/region_mean": 0.0023166672108345665, + "epoch": 2.9798775153105863, + "grad_norm": 0.2742725610733032, + "learning_rate": 1e-06, + "loss": -0.0776, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0028186667332192883, + "clip_ratio/high_mean": 0.0011238940751354676, + "clip_ratio/low_mean": 0.0006910904971846321, + "clip_ratio/low_min": 3.5132095945300534e-05, + "clip_ratio/region_mean": 0.0018149845491279848, + "completions/clipped_ratio": 0.1495535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4083.0, + "completions/mean_length": 1191.9520263671875, + "completions/mean_terminated_length": 681.2664184570312, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 2.9822105570137065, + "grad_norm": 0.3575637936592102, + "learning_rate": 1e-06, + "loss": -0.0632, + "num_tokens": 187871031.0, + "reward": 0.6194196939468384, + "reward_std": 0.17720773816108704, + "rewards/verify_math_reward/mean": 0.6194196343421936, + "rewards/verify_math_reward/std": 0.48580074310302734, + "step": 1277 + }, + { + "clip_ratio/high_max": 0.003162377870467026, + "clip_ratio/high_mean": 0.0012078278268745635, + "clip_ratio/low_mean": 0.0010228057308268035, + "clip_ratio/low_min": 3.4270047763129696e-05, + "clip_ratio/region_mean": 0.002230633508588653, + "epoch": 2.984543598716827, + "grad_norm": 0.2947334051132202, + "learning_rate": 1e-06, + "loss": -0.0634, + "step": 1278 + }, + { + "clip_ratio/high_max": 0.003265526691393461, + "clip_ratio/high_mean": 0.0011708471392921638, + "clip_ratio/low_mean": 0.0011104116874776082, + "clip_ratio/low_min": 1.7566047972650267e-05, + "clip_ratio/region_mean": 0.0022812587922089733, + "epoch": 2.9868766404199474, + "grad_norm": 0.24825675785541534, + "learning_rate": 1e-06, + "loss": -0.0636, + "step": 1279 + }, + { + "clip_ratio/high_max": 0.0033663997164694592, + "clip_ratio/high_mean": 0.0013209114513301756, + "clip_ratio/low_mean": 0.0014029893063707277, + "clip_ratio/low_min": 5.3065035899635404e-05, + "clip_ratio/region_mean": 0.002723900804994628, + "epoch": 2.989209682123068, + "grad_norm": 0.2688109576702118, + "learning_rate": 1e-06, + "loss": -0.0637, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0024496471014572307, + "clip_ratio/high_mean": 0.0010025371338997502, + "clip_ratio/low_mean": 0.0006610128039028496, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016635499450785574, + "completions/clipped_ratio": 0.2020089285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3437.0, + "completions/mean_length": 1351.2913818359375, + "completions/mean_terminated_length": 656.4769287109375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 2.9915427238261882, + "grad_norm": 0.4109784662723541, + "learning_rate": 1e-06, + "loss": -0.0831, + "num_tokens": 188444508.0, + "reward": 0.5725446939468384, + "reward_std": 0.16671767830848694, + "rewards/verify_math_reward/mean": 0.5725446343421936, + "rewards/verify_math_reward/std": 0.49498558044433594, + "step": 1281 + }, + { + "clip_ratio/high_max": 0.0033538812494953163, + "clip_ratio/high_mean": 0.0012586799457494635, + "clip_ratio/low_mean": 0.0009316196228610352, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002190299579524435, + "epoch": 2.993875765529309, + "grad_norm": 0.3125744163990021, + "learning_rate": 1e-06, + "loss": -0.0833, + "step": 1282 + }, + { + "clip_ratio/high_max": 0.0035722587344935164, + "clip_ratio/high_mean": 0.0013289053640619386, + "clip_ratio/low_mean": 0.0011280564067419618, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002456961723510176, + "epoch": 2.9962088072324295, + "grad_norm": 0.2602030038833618, + "learning_rate": 1e-06, + "loss": -0.0835, + "step": 1283 + }, + { + "clip_ratio/high_max": 0.0030436663510045037, + "clip_ratio/high_mean": 0.0011226604074181523, + "clip_ratio/low_mean": 0.0013129749859217554, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024356354406336322, + "epoch": 2.9985418489355498, + "grad_norm": 0.2609206438064575, + "learning_rate": 1e-06, + "loss": -0.0836, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0025205490383086726, + "clip_ratio/high_mean": 0.0008495939018757781, + "clip_ratio/low_mean": 0.000519059556609136, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013686535021406598, + "completions/clipped_ratio": 0.2243303571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3577.0, + "completions/mean_length": 1500.4029541015625, + "completions/mean_terminated_length": 749.7338256835938, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 3.0023330417031207, + "grad_norm": 0.32256942987442017, + "learning_rate": 1e-06, + "loss": -0.0761, + "num_tokens": 189055861.0, + "reward": 0.4720982313156128, + "reward_std": 0.15582603216171265, + "rewards/verify_math_reward/mean": 0.4720982015132904, + "rewards/verify_math_reward/std": 0.49949970841407776, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.002837576321326196, + "clip_ratio/high_mean": 0.0010215287147730123, + "clip_ratio/low_mean": 0.0007602082423545653, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001781736958946567, + "epoch": 3.004666083406241, + "grad_norm": 0.23747889697551727, + "learning_rate": 1e-06, + "loss": -0.0764, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.0030404913741222117, + "clip_ratio/high_mean": 0.0010446386586409062, + "clip_ratio/low_mean": 0.0009466475057706703, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001991286117117852, + "epoch": 3.0069991251093615, + "grad_norm": 0.23243752121925354, + "learning_rate": 1e-06, + "loss": -0.0765, + "step": 1287 + }, + { + "clip_ratio/high_max": 0.0028408026046236046, + "clip_ratio/high_mean": 0.0010033117487182608, + "clip_ratio/low_mean": 0.001017110549582867, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002020422303758096, + "epoch": 3.0093321668124817, + "grad_norm": 0.2571777403354645, + "learning_rate": 1e-06, + "loss": -0.0766, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0020180494066153187, + "clip_ratio/high_mean": 0.0007340943420786061, + "clip_ratio/low_mean": 0.0005057745229350985, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012398688468238106, + "completions/clipped_ratio": 0.1830357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3731.0, + "completions/mean_length": 1326.7890625, + "completions/mean_terminated_length": 706.36474609375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 3.0116652085156024, + "grad_norm": 0.350341796875, + "learning_rate": 1e-06, + "loss": -0.0606, + "num_tokens": 189655416.0, + "reward": 0.5703125, + "reward_std": 0.13527169823646545, + "rewards/verify_math_reward/mean": 0.5703125, + "rewards/verify_math_reward/std": 0.49530795216560364, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.0026527890586294234, + "clip_ratio/high_mean": 0.0009219521225531935, + "clip_ratio/low_mean": 0.0006422939604817657, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015642460857634433, + "epoch": 3.0139982502187226, + "grad_norm": 0.23119445145130157, + "learning_rate": 1e-06, + "loss": -0.0608, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0021421724777610507, + "clip_ratio/high_mean": 0.0008223191680372111, + "clip_ratio/low_mean": 0.0007526751051045721, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015749942758702673, + "epoch": 3.0163312919218432, + "grad_norm": 0.21958255767822266, + "learning_rate": 1e-06, + "loss": -0.061, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.002493561652954668, + "clip_ratio/high_mean": 0.0009063939414772904, + "clip_ratio/low_mean": 0.0010313017191947438, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019376956952328328, + "epoch": 3.0186643336249634, + "grad_norm": 0.20196911692619324, + "learning_rate": 1e-06, + "loss": -0.0611, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.002537009750085417, + "clip_ratio/high_mean": 0.000978603291514446, + "clip_ratio/low_mean": 0.0005903242995373148, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015689275933254976, + "completions/clipped_ratio": 0.2075892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3190.0, + "completions/mean_length": 1396.829345703125, + "completions/mean_terminated_length": 689.7225341796875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 3.020997375328084, + "grad_norm": 0.375221848487854, + "learning_rate": 1e-06, + "loss": -0.087, + "num_tokens": 190234783.0, + "reward": 0.5256696939468384, + "reward_std": 0.19271855056285858, + "rewards/verify_math_reward/mean": 0.5256696343421936, + "rewards/verify_math_reward/std": 0.4996195435523987, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.0033749637150322087, + "clip_ratio/high_mean": 0.001275835860724328, + "clip_ratio/low_mean": 0.0007964365959196584, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002072272458462976, + "epoch": 3.0233304170312043, + "grad_norm": 0.26271942257881165, + "learning_rate": 1e-06, + "loss": -0.0873, + "step": 1294 + }, + { + "clip_ratio/high_max": 0.003290289307187777, + "clip_ratio/high_mean": 0.0012362351626507007, + "clip_ratio/low_mean": 0.0009485341597610386, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002184769276937004, + "epoch": 3.025663458734325, + "grad_norm": 0.2825155258178711, + "learning_rate": 1e-06, + "loss": -0.0875, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0031547732069157064, + "clip_ratio/high_mean": 0.0012714094336843118, + "clip_ratio/low_mean": 0.0011926287315873196, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024640381743665785, + "epoch": 3.027996500437445, + "grad_norm": 0.2432880997657776, + "learning_rate": 1e-06, + "loss": -0.0875, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0024913674205890857, + "clip_ratio/high_mean": 0.0008098549060377991, + "clip_ratio/low_mean": 0.0005295911487337435, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013394460802373942, + "completions/clipped_ratio": 0.1863839285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4066.0, + "completions/mean_length": 1315.8248291015625, + "completions/mean_terminated_length": 678.9396362304688, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 3.030329542140566, + "grad_norm": 0.2805899977684021, + "learning_rate": 1e-06, + "loss": -0.0622, + "num_tokens": 190820322.0, + "reward": 0.5625, + "reward_std": 0.1513993740081787, + "rewards/verify_math_reward/mean": 0.5625, + "rewards/verify_math_reward/std": 0.49635544419288635, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.002850104974640999, + "clip_ratio/high_mean": 0.0009773864730959758, + "clip_ratio/low_mean": 0.0007491319574910449, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017265184615098406, + "epoch": 3.032662583843686, + "grad_norm": 0.238377183675766, + "learning_rate": 1e-06, + "loss": -0.0624, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.003161500957503449, + "clip_ratio/high_mean": 0.0009961924697563518, + "clip_ratio/low_mean": 0.0009126104705501348, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019088029439444654, + "epoch": 3.0349956255468067, + "grad_norm": 0.228261336684227, + "learning_rate": 1e-06, + "loss": -0.0626, + "step": 1299 + }, + { + "clip_ratio/high_max": 0.002946636486740317, + "clip_ratio/high_mean": 0.0009655360481701791, + "clip_ratio/low_mean": 0.0010455251995153958, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020110612640564796, + "epoch": 3.037328667249927, + "grad_norm": 0.22765207290649414, + "learning_rate": 1e-06, + "loss": -0.0626, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.002572289333329536, + "clip_ratio/high_mean": 0.0010175790775974747, + "clip_ratio/low_mean": 0.0005612465474769124, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015788256023370195, + "completions/clipped_ratio": 0.1629464285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3429.0, + "completions/mean_length": 1212.0491943359375, + "completions/mean_terminated_length": 650.6400146484375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 3.0396617089530475, + "grad_norm": 0.3184731900691986, + "learning_rate": 1e-06, + "loss": -0.0827, + "num_tokens": 191388566.0, + "reward": 0.609375, + "reward_std": 0.1598842889070511, + "rewards/verify_math_reward/mean": 0.609375, + "rewards/verify_math_reward/std": 0.48816296458244324, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.002737827966484474, + "clip_ratio/high_mean": 0.001159976873168489, + "clip_ratio/low_mean": 0.0006855983087916684, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018455751924193464, + "epoch": 3.041994750656168, + "grad_norm": 0.28312474489212036, + "learning_rate": 1e-06, + "loss": -0.083, + "step": 1302 + }, + { + "clip_ratio/high_max": 0.003003618709044531, + "clip_ratio/high_mean": 0.001171711934148334, + "clip_ratio/low_mean": 0.0009604678125469945, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002132179753971286, + "epoch": 3.0443277923592884, + "grad_norm": 0.24512238800525665, + "learning_rate": 1e-06, + "loss": -0.0831, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.0028580743819475174, + "clip_ratio/high_mean": 0.0011533901815710124, + "clip_ratio/low_mean": 0.0010000799777571956, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002153470122721046, + "epoch": 3.046660834062409, + "grad_norm": 0.3053385615348816, + "learning_rate": 1e-06, + "loss": -0.0831, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0020921457835356705, + "clip_ratio/high_mean": 0.0007462162448064191, + "clip_ratio/low_mean": 0.0004180340970378893, + "clip_ratio/low_min": 1.9913972209906206e-05, + "clip_ratio/region_mean": 0.0011642503486655187, + "completions/clipped_ratio": 0.1785714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3833.0, + "completions/mean_length": 1298.3717041015625, + "completions/mean_terminated_length": 690.1915893554688, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 3.0489938757655293, + "grad_norm": 0.2806278169155121, + "learning_rate": 1e-06, + "loss": -0.058, + "num_tokens": 191988699.0, + "reward": 0.5792410969734192, + "reward_std": 0.1340659260749817, + "rewards/verify_math_reward/mean": 0.5792410969734192, + "rewards/verify_math_reward/std": 0.49395665526390076, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.002359187856200151, + "clip_ratio/high_mean": 0.0009155851730611175, + "clip_ratio/low_mean": 0.0005765044097643113, + "clip_ratio/low_min": 2.3773298380547203e-05, + "clip_ratio/region_mean": 0.0014920895664545242, + "epoch": 3.05132691746865, + "grad_norm": 0.28606978058815, + "learning_rate": 1e-06, + "loss": -0.0582, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.0025636116333771497, + "clip_ratio/high_mean": 0.0008925399761210429, + "clip_ratio/low_mean": 0.0007514707058362546, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001644010713789612, + "epoch": 3.05365995917177, + "grad_norm": 0.1945660561323166, + "learning_rate": 1e-06, + "loss": -0.0584, + "step": 1307 + }, + { + "clip_ratio/high_max": 0.0023899032203189563, + "clip_ratio/high_mean": 0.0008637774208182236, + "clip_ratio/low_mean": 0.0008972271461971104, + "clip_ratio/low_min": 1.9913972209906206e-05, + "clip_ratio/region_mean": 0.001761004503350705, + "epoch": 3.055993000874891, + "grad_norm": 0.20888541638851166, + "learning_rate": 1e-06, + "loss": -0.0585, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0028958607435924932, + "clip_ratio/high_mean": 0.0011245208734180778, + "clip_ratio/low_mean": 0.0005285132112931024, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016530340944882482, + "completions/clipped_ratio": 0.2321428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4034.0, + "completions/mean_length": 1454.505615234375, + "completions/mean_terminated_length": 655.9142456054688, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 3.058326042578011, + "grad_norm": 0.30097147822380066, + "learning_rate": 1e-06, + "loss": -0.0859, + "num_tokens": 192532720.0, + "reward": 0.5558035969734192, + "reward_std": 0.1601438969373703, + "rewards/verify_math_reward/mean": 0.5558035969734192, + "rewards/verify_math_reward/std": 0.49715372920036316, + "step": 1309 + }, + { + "clip_ratio/high_max": 0.003858191121253185, + "clip_ratio/high_mean": 0.0013613364571938291, + "clip_ratio/low_mean": 0.0007244455564432428, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020857820054516196, + "epoch": 3.0606590842811316, + "grad_norm": 0.29711002111434937, + "learning_rate": 1e-06, + "loss": -0.0862, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0034845851332647726, + "clip_ratio/high_mean": 0.0013190685895096976, + "clip_ratio/low_mean": 0.0008171836466317473, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021362522566050757, + "epoch": 3.062992125984252, + "grad_norm": 0.2635003328323364, + "learning_rate": 1e-06, + "loss": -0.0863, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.0035168484901078045, + "clip_ratio/high_mean": 0.001348177953332197, + "clip_ratio/low_mean": 0.0010513546640140703, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002399532611889299, + "epoch": 3.0653251676873725, + "grad_norm": 0.24283172190189362, + "learning_rate": 1e-06, + "loss": -0.0864, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0025021589008247247, + "clip_ratio/high_mean": 0.0010183860122197075, + "clip_ratio/low_mean": 0.0004949986587234889, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001513384671852691, + "completions/clipped_ratio": 0.1863839285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2602.0, + "completions/mean_length": 1248.216552734375, + "completions/mean_terminated_length": 595.8436279296875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 3.0676582093904927, + "grad_norm": 0.37111696600914, + "learning_rate": 1e-06, + "loss": -0.0602, + "num_tokens": 193057898.0, + "reward": 0.6037946939468384, + "reward_std": 0.1543617695569992, + "rewards/verify_math_reward/mean": 0.6037946343421936, + "rewards/verify_math_reward/std": 0.48938122391700745, + "step": 1313 + }, + { + "clip_ratio/high_max": 0.0034964400474564172, + "clip_ratio/high_mean": 0.001275072372663999, + "clip_ratio/low_mean": 0.0007646513663530641, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020397236621647608, + "epoch": 3.0699912510936134, + "grad_norm": 0.2912351191043854, + "learning_rate": 1e-06, + "loss": -0.0605, + "step": 1314 + }, + { + "clip_ratio/high_max": 0.003168079365423182, + "clip_ratio/high_mean": 0.0012253887607585057, + "clip_ratio/low_mean": 0.0009435755418962799, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021689642599085346, + "epoch": 3.0723242927967336, + "grad_norm": 0.2485773265361786, + "learning_rate": 1e-06, + "loss": -0.0607, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0031388264724228065, + "clip_ratio/high_mean": 0.0012513939327618573, + "clip_ratio/low_mean": 0.0010505451255085063, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023019390864646994, + "epoch": 3.0746573344998542, + "grad_norm": 0.26045605540275574, + "learning_rate": 1e-06, + "loss": -0.0607, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0022871091787237674, + "clip_ratio/high_mean": 0.000889468803507043, + "clip_ratio/low_mean": 0.0005702042794837325, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014596730943594594, + "completions/clipped_ratio": 0.1819196428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3486.0, + "completions/mean_length": 1329.7109375, + "completions/mean_terminated_length": 714.5607299804688, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 3.0769903762029744, + "grad_norm": 0.35755103826522827, + "learning_rate": 1e-06, + "loss": -0.075, + "num_tokens": 193677407.0, + "reward": 0.5011160969734192, + "reward_std": 0.15090125799179077, + "rewards/verify_math_reward/mean": 0.5011160969734192, + "rewards/verify_math_reward/std": 0.5002779960632324, + "step": 1317 + }, + { + "clip_ratio/high_max": 0.00302785503299674, + "clip_ratio/high_mean": 0.0011055794529966079, + "clip_ratio/low_mean": 0.0008301125717480318, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019356920092832297, + "epoch": 3.079323417906095, + "grad_norm": 0.23217631876468658, + "learning_rate": 1e-06, + "loss": -0.0752, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.0029139655307517387, + "clip_ratio/high_mean": 0.0010585647978587076, + "clip_ratio/low_mean": 0.0008228557717302465, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018814205541275442, + "epoch": 3.0816564596092153, + "grad_norm": 0.24049969017505646, + "learning_rate": 1e-06, + "loss": -0.0753, + "step": 1319 + }, + { + "clip_ratio/high_max": 0.002689016386284493, + "clip_ratio/high_mean": 0.0010127902824024204, + "clip_ratio/low_mean": 0.0010295547544956207, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00204234504053602, + "epoch": 3.083989501312336, + "grad_norm": 0.22247229516506195, + "learning_rate": 1e-06, + "loss": -0.0754, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0022223807318368927, + "clip_ratio/high_mean": 0.000926491069549229, + "clip_ratio/low_mean": 0.0005344799919839716, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014609710924560204, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3970.0, + "completions/mean_length": 1175.2913818359375, + "completions/mean_terminated_length": 634.4193115234375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 3.0863225430154566, + "grad_norm": 0.3505702614784241, + "learning_rate": 1e-06, + "loss": -0.0763, + "num_tokens": 194243804.0, + "reward": 0.637276828289032, + "reward_std": 0.16040420532226562, + "rewards/verify_math_reward/mean": 0.6372767686843872, + "rewards/verify_math_reward/std": 0.481054425239563, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.002552797508542426, + "clip_ratio/high_mean": 0.00113710213554441, + "clip_ratio/low_mean": 0.0007302283574972535, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00186733047303278, + "epoch": 3.088655584718577, + "grad_norm": 0.27257534861564636, + "learning_rate": 1e-06, + "loss": -0.0766, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.0024929726532718632, + "clip_ratio/high_mean": 0.0010018376469815848, + "clip_ratio/low_mean": 0.0008190456974261906, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00182088326982921, + "epoch": 3.0909886264216975, + "grad_norm": 0.2139657884836197, + "learning_rate": 1e-06, + "loss": -0.0767, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.0023928607697598636, + "clip_ratio/high_mean": 0.001074929530659574, + "clip_ratio/low_mean": 0.0009412316030648071, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020161611028015614, + "epoch": 3.0933216681248177, + "grad_norm": 0.21198685467243195, + "learning_rate": 1e-06, + "loss": -0.0768, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0020417028936208226, + "clip_ratio/high_mean": 0.0007766804992570542, + "clip_ratio/low_mean": 0.0005703795477529638, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013470600388245657, + "completions/clipped_ratio": 0.1517857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3445.0, + "completions/mean_length": 1186.0101318359375, + "completions/mean_terminated_length": 665.2750244140625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 3.0956547098279383, + "grad_norm": 0.2741696238517761, + "learning_rate": 1e-06, + "loss": -0.058, + "num_tokens": 194836629.0, + "reward": 0.5915178656578064, + "reward_std": 0.15567587316036224, + "rewards/verify_math_reward/mean": 0.5915178656578064, + "rewards/verify_math_reward/std": 0.49182769656181335, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.002530857287638355, + "clip_ratio/high_mean": 0.0010029010190919507, + "clip_ratio/low_mean": 0.0007089033119882515, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017118043251684867, + "epoch": 3.0979877515310585, + "grad_norm": 0.24755199253559113, + "learning_rate": 1e-06, + "loss": -0.0582, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.0025047990056918934, + "clip_ratio/high_mean": 0.0009640709140512627, + "clip_ratio/low_mean": 0.0007944907483761199, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001758561615133658, + "epoch": 3.100320793234179, + "grad_norm": 0.32930606603622437, + "learning_rate": 1e-06, + "loss": -0.0583, + "step": 1327 + }, + { + "clip_ratio/high_max": 0.0024294221111631487, + "clip_ratio/high_mean": 0.0008861384922056459, + "clip_ratio/low_mean": 0.0010210815198661294, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019072200229857117, + "epoch": 3.1026538349372994, + "grad_norm": 0.20770609378814697, + "learning_rate": 1e-06, + "loss": -0.0584, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0025223410448234063, + "clip_ratio/high_mean": 0.0008677193072799128, + "clip_ratio/low_mean": 0.0005741742888858425, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014418935825233348, + "completions/clipped_ratio": 0.1573660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2366.0, + "completions/mean_length": 1134.923095703125, + "completions/mean_terminated_length": 581.9271850585938, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 3.10498687664042, + "grad_norm": 0.3837740421295166, + "learning_rate": 1e-06, + "loss": -0.0606, + "num_tokens": 195371312.0, + "reward": 0.613839328289032, + "reward_std": 0.1515427529811859, + "rewards/verify_math_reward/mean": 0.6138392686843872, + "rewards/verify_math_reward/std": 0.48714008927345276, + "step": 1329 + }, + { + "clip_ratio/high_max": 0.0028084589139325544, + "clip_ratio/high_mean": 0.0010732359787652967, + "clip_ratio/low_mean": 0.0008945150748331798, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001967751064512413, + "epoch": 3.1073199183435403, + "grad_norm": 0.3349197208881378, + "learning_rate": 1e-06, + "loss": -0.0609, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0028585824693436734, + "clip_ratio/high_mean": 0.0010309245662938338, + "clip_ratio/low_mean": 0.0009570273650751915, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001987951938644983, + "epoch": 3.109652960046661, + "grad_norm": 0.2798576354980469, + "learning_rate": 1e-06, + "loss": -0.0611, + "step": 1331 + }, + { + "clip_ratio/high_max": 0.0028027815860696137, + "clip_ratio/high_mean": 0.0010115866825799458, + "clip_ratio/low_mean": 0.0010797360355354613, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002091322712658439, + "epoch": 3.111986001749781, + "grad_norm": 0.2401420623064041, + "learning_rate": 1e-06, + "loss": -0.0612, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.002997489478730131, + "clip_ratio/high_mean": 0.0010483501500857528, + "clip_ratio/low_mean": 0.0006875655117255519, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017359156845486723, + "completions/clipped_ratio": 0.2075892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4033.0, + "completions/mean_length": 1388.8985595703125, + "completions/mean_terminated_length": 679.7140502929688, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 3.114319043452902, + "grad_norm": 0.2810341417789459, + "learning_rate": 1e-06, + "loss": -0.0808, + "num_tokens": 195954741.0, + "reward": 0.5390625, + "reward_std": 0.148578941822052, + "rewards/verify_math_reward/mean": 0.5390625, + "rewards/verify_math_reward/std": 0.4987502098083496, + "step": 1333 + }, + { + "clip_ratio/high_max": 0.0032104247584356926, + "clip_ratio/high_mean": 0.0011600665020523593, + "clip_ratio/low_mean": 0.000777182036472368, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019372485403437167, + "epoch": 3.116652085156022, + "grad_norm": 0.26112329959869385, + "learning_rate": 1e-06, + "loss": -0.081, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.0038014510937500745, + "clip_ratio/high_mean": 0.0013046984822722152, + "clip_ratio/low_mean": 0.0011017159140465083, + "clip_ratio/low_min": 1.90200844372157e-05, + "clip_ratio/region_mean": 0.0024064143872237764, + "epoch": 3.1189851268591426, + "grad_norm": 0.22285813093185425, + "learning_rate": 1e-06, + "loss": -0.0812, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0032983868804876693, + "clip_ratio/high_mean": 0.0011769670236390084, + "clip_ratio/low_mean": 0.0011258692848059582, + "clip_ratio/low_min": 9.577076525602024e-06, + "clip_ratio/region_mean": 0.002302836270246189, + "epoch": 3.121318168562263, + "grad_norm": 0.22714033722877502, + "learning_rate": 1e-06, + "loss": -0.0812, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.002241811485873768, + "clip_ratio/high_mean": 0.000784858597398852, + "clip_ratio/low_mean": 0.0004849584456678713, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012698170139628928, + "completions/clipped_ratio": 0.1930803571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3163.0, + "completions/mean_length": 1253.9598388671875, + "completions/mean_terminated_length": 573.9142456054688, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 3.1236512102653835, + "grad_norm": 0.361855685710907, + "learning_rate": 1e-06, + "loss": -0.0635, + "num_tokens": 196451225.0, + "reward": 0.6305803656578064, + "reward_std": 0.13080225884914398, + "rewards/verify_math_reward/mean": 0.6305803656578064, + "rewards/verify_math_reward/std": 0.4829172194004059, + "step": 1337 + }, + { + "clip_ratio/high_max": 0.002304529039975023, + "clip_ratio/high_mean": 0.0008729561104701133, + "clip_ratio/low_mean": 0.0006413904757209821, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015143465643632226, + "epoch": 3.1259842519685037, + "grad_norm": 0.26827290654182434, + "learning_rate": 1e-06, + "loss": -0.0636, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.0027912609184568282, + "clip_ratio/high_mean": 0.0009762080953805707, + "clip_ratio/low_mean": 0.0008614428024884546, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018376508523942903, + "epoch": 3.1283172936716244, + "grad_norm": 0.21968616545200348, + "learning_rate": 1e-06, + "loss": -0.0637, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.0025011100333358627, + "clip_ratio/high_mean": 0.0008304880302603124, + "clip_ratio/low_mean": 0.0009597744519851403, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001790262478607474, + "epoch": 3.130650335374745, + "grad_norm": 0.22235487401485443, + "learning_rate": 1e-06, + "loss": -0.0639, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.002429226202366408, + "clip_ratio/high_mean": 0.0009180686429317575, + "clip_ratio/low_mean": 0.0007322976243813173, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016503662773175165, + "completions/clipped_ratio": 0.1640625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3774.0, + "completions/mean_length": 1257.1138916015625, + "completions/mean_terminated_length": 699.9492797851562, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 3.1329833770778652, + "grad_norm": 0.33593234419822693, + "learning_rate": 1e-06, + "loss": -0.0803, + "num_tokens": 197065823.0, + "reward": 0.5133928656578064, + "reward_std": 0.1994110643863678, + "rewards/verify_math_reward/mean": 0.5133928656578064, + "rewards/verify_math_reward/std": 0.500099778175354, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.0027345400958438404, + "clip_ratio/high_mean": 0.001094697949156398, + "clip_ratio/low_mean": 0.0009638746232667472, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002058572594251018, + "epoch": 3.135316418780986, + "grad_norm": 0.3292315602302551, + "learning_rate": 1e-06, + "loss": -0.0805, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.003092270919296425, + "clip_ratio/high_mean": 0.0011939085452468134, + "clip_ratio/low_mean": 0.0012318274784774985, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002425736100121867, + "epoch": 3.137649460484106, + "grad_norm": 0.24277986586093903, + "learning_rate": 1e-06, + "loss": -0.0807, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.003047489677555859, + "clip_ratio/high_mean": 0.00112897347935359, + "clip_ratio/low_mean": 0.0013810614182148129, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0025100349012063816, + "epoch": 3.1399825021872267, + "grad_norm": 0.22549711167812347, + "learning_rate": 1e-06, + "loss": -0.0808, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.002825524723448325, + "clip_ratio/high_mean": 0.0009551564926368883, + "clip_ratio/low_mean": 0.0005220908387855161, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014772473223274574, + "completions/clipped_ratio": 0.1618303571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2880.0, + "completions/mean_length": 1180.544677734375, + "completions/mean_terminated_length": 617.6404418945312, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 3.142315543890347, + "grad_norm": 0.3498491644859314, + "learning_rate": 1e-06, + "loss": -0.0639, + "num_tokens": 197616975.0, + "reward": 0.6238839626312256, + "reward_std": 0.13414262235164642, + "rewards/verify_math_reward/mean": 0.6238839030265808, + "rewards/verify_math_reward/std": 0.4846802353858948, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.002978554868604988, + "clip_ratio/high_mean": 0.0010518930212128907, + "clip_ratio/low_mean": 0.0006826523404015461, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001734545348881511, + "epoch": 3.1446485855934676, + "grad_norm": 0.31992146372795105, + "learning_rate": 1e-06, + "loss": -0.0641, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.0032743963893153705, + "clip_ratio/high_mean": 0.0010800384643516736, + "clip_ratio/low_mean": 0.0009262666280847043, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020063050906173885, + "epoch": 3.146981627296588, + "grad_norm": 0.28409144282341003, + "learning_rate": 1e-06, + "loss": -0.0643, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.00318326481647091, + "clip_ratio/high_mean": 0.0010985936642100569, + "clip_ratio/low_mean": 0.0011267528734606458, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002225346521299798, + "epoch": 3.1493146689997085, + "grad_norm": 0.233980193734169, + "learning_rate": 1e-06, + "loss": -0.0644, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.002139932978025172, + "clip_ratio/high_mean": 0.0009107685109484009, + "clip_ratio/low_mean": 0.000569330952657765, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014800994686083868, + "completions/clipped_ratio": 0.2287946428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3305.0, + "completions/mean_length": 1480.96435546875, + "completions/mean_terminated_length": 705.15771484375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 3.1516477107028287, + "grad_norm": 0.3200514018535614, + "learning_rate": 1e-06, + "loss": -0.0999, + "num_tokens": 198218823.0, + "reward": 0.4810267984867096, + "reward_std": 0.15747570991516113, + "rewards/verify_math_reward/mean": 0.4810267984867096, + "rewards/verify_math_reward/std": 0.49991899728775024, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.003589366293454077, + "clip_ratio/high_mean": 0.0013790113334835041, + "clip_ratio/low_mean": 0.0007359550145338289, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002114966333465418, + "epoch": 3.1539807524059493, + "grad_norm": 0.2880544364452362, + "learning_rate": 1e-06, + "loss": -0.1002, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0030352926878549624, + "clip_ratio/high_mean": 0.0012039461980748456, + "clip_ratio/low_mean": 0.0009654550049162935, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021694011993531603, + "epoch": 3.1563137941090695, + "grad_norm": 0.22166696190834045, + "learning_rate": 1e-06, + "loss": -0.1003, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.0034358482298557647, + "clip_ratio/high_mean": 0.0012475168659875635, + "clip_ratio/low_mean": 0.0010006393131334335, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022481561463791877, + "epoch": 3.15864683581219, + "grad_norm": 0.3134857714176178, + "learning_rate": 1e-06, + "loss": -0.1004, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0018772271469060797, + "clip_ratio/high_mean": 0.0006525577919092029, + "clip_ratio/low_mean": 0.0005370141607272672, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001189571961731417, + "completions/clipped_ratio": 0.1540178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3762.0, + "completions/mean_length": 1193.2578125, + "completions/mean_terminated_length": 664.790283203125, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 3.1609798775153104, + "grad_norm": 0.2849535048007965, + "learning_rate": 1e-06, + "loss": -0.0671, + "num_tokens": 198815262.0, + "reward": 0.5714285969734192, + "reward_std": 0.14327509701251984, + "rewards/verify_math_reward/mean": 0.5714285969734192, + "rewards/verify_math_reward/std": 0.49514806270599365, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.0023393456867779605, + "clip_ratio/high_mean": 0.0008391679275518982, + "clip_ratio/low_mean": 0.0008357585793419275, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001674926512350794, + "epoch": 3.163312919218431, + "grad_norm": 0.20166900753974915, + "learning_rate": 1e-06, + "loss": -0.0674, + "step": 1354 + }, + { + "clip_ratio/high_max": 0.002438125266053248, + "clip_ratio/high_mean": 0.0008892816931620473, + "clip_ratio/low_mean": 0.0008286072243208764, + "clip_ratio/low_min": 7.640586773050018e-06, + "clip_ratio/region_mean": 0.0017178889029310085, + "epoch": 3.1656459609215517, + "grad_norm": 0.22612257301807404, + "learning_rate": 1e-06, + "loss": -0.0674, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0021777985348307993, + "clip_ratio/high_mean": 0.0007701308604737278, + "clip_ratio/low_mean": 0.001180693635433272, + "clip_ratio/low_min": 2.2921760319150053e-05, + "clip_ratio/region_mean": 0.0019508245386532508, + "epoch": 3.167979002624672, + "grad_norm": 0.19750025868415833, + "learning_rate": 1e-06, + "loss": -0.0676, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0021138735828571953, + "clip_ratio/high_mean": 0.0008293652481370373, + "clip_ratio/low_mean": 0.0005782775979241705, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014076428415137343, + "completions/clipped_ratio": 0.1852678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3315.0, + "completions/mean_length": 1272.3973388671875, + "completions/mean_terminated_length": 630.3178100585938, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 3.1703120443277926, + "grad_norm": 0.3227970600128174, + "learning_rate": 1e-06, + "loss": -0.0737, + "num_tokens": 199360290.0, + "reward": 0.5736607313156128, + "reward_std": 0.148612841963768, + "rewards/verify_math_reward/mean": 0.5736607313156128, + "rewards/verify_math_reward/std": 0.4948205351829529, + "step": 1357 + }, + { + "clip_ratio/high_max": 0.0030250952040660195, + "clip_ratio/high_mean": 0.0011142691564600682, + "clip_ratio/low_mean": 0.0007781955441714672, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018924646901723463, + "epoch": 3.1726450860309128, + "grad_norm": 0.246768981218338, + "learning_rate": 1e-06, + "loss": -0.0739, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.0028133710147812963, + "clip_ratio/high_mean": 0.001045112068823073, + "clip_ratio/low_mean": 0.000986219386504672, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002031331448961282, + "epoch": 3.1749781277340334, + "grad_norm": 0.2945663034915924, + "learning_rate": 1e-06, + "loss": -0.0742, + "step": 1359 + }, + { + "clip_ratio/high_max": 0.0024650287450640462, + "clip_ratio/high_mean": 0.0009208102055708878, + "clip_ratio/low_mean": 0.0010797965260280762, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00200060671340907, + "epoch": 3.1773111694371536, + "grad_norm": 0.22011803090572357, + "learning_rate": 1e-06, + "loss": -0.0742, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0028081150812795386, + "clip_ratio/high_mean": 0.0010802435426739976, + "clip_ratio/low_mean": 0.0005825233292853227, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016627668555884156, + "completions/clipped_ratio": 0.1908482142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3346.0, + "completions/mean_length": 1339.078125, + "completions/mean_terminated_length": 688.8248291015625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 3.1796442111402743, + "grad_norm": 0.3312619924545288, + "learning_rate": 1e-06, + "loss": -0.1132, + "num_tokens": 199942360.0, + "reward": 0.520089328289032, + "reward_std": 0.1837480217218399, + "rewards/verify_math_reward/mean": 0.5200892686843872, + "rewards/verify_math_reward/std": 0.4998753070831299, + "step": 1361 + }, + { + "clip_ratio/high_max": 0.0037408599819173105, + "clip_ratio/high_mean": 0.001404201702825958, + "clip_ratio/low_mean": 0.0008379365717701148, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022421382745960727, + "epoch": 3.1819772528433945, + "grad_norm": 0.29121676087379456, + "learning_rate": 1e-06, + "loss": -0.1135, + "step": 1362 + }, + { + "clip_ratio/high_max": 0.0035552595218177885, + "clip_ratio/high_mean": 0.001331019233475672, + "clip_ratio/low_mean": 0.0009413602620043093, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002272379490023013, + "epoch": 3.184310294546515, + "grad_norm": 0.24848061800003052, + "learning_rate": 1e-06, + "loss": -0.1137, + "step": 1363 + }, + { + "clip_ratio/high_max": 0.003499184087559115, + "clip_ratio/high_mean": 0.0012648745578189846, + "clip_ratio/low_mean": 0.001018966988340253, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022838415825390257, + "epoch": 3.1866433362496354, + "grad_norm": 0.2734673321247101, + "learning_rate": 1e-06, + "loss": -0.1138, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0033895653759827837, + "clip_ratio/high_mean": 0.001157745202363003, + "clip_ratio/low_mean": 0.0006268017434649664, + "clip_ratio/low_min": 9.797773600439541e-06, + "clip_ratio/region_mean": 0.001784546908311313, + "completions/clipped_ratio": 0.2142857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3118.0, + "completions/mean_length": 1455.985595703125, + "completions/mean_terminated_length": 735.9815673828125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 3.188976377952756, + "grad_norm": 0.3164910078048706, + "learning_rate": 1e-06, + "loss": -0.1014, + "num_tokens": 200553963.0, + "reward": 0.504464328289032, + "reward_std": 0.18614476919174194, + "rewards/verify_math_reward/mean": 0.5044642686843872, + "rewards/verify_math_reward/std": 0.5002593398094177, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0032233367674052715, + "clip_ratio/high_mean": 0.0011809509014710784, + "clip_ratio/low_mean": 0.0009057749102794332, + "clip_ratio/low_min": 1.9595547200879082e-05, + "clip_ratio/region_mean": 0.00208672575536184, + "epoch": 3.1913094196558762, + "grad_norm": 0.28135693073272705, + "learning_rate": 1e-06, + "loss": -0.1016, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.0033793382317526266, + "clip_ratio/high_mean": 0.0012781320911017247, + "clip_ratio/low_mean": 0.0011014879482900142, + "clip_ratio/low_min": 3.9191094401758164e-05, + "clip_ratio/region_mean": 0.002379620047577191, + "epoch": 3.193642461358997, + "grad_norm": 0.23779508471488953, + "learning_rate": 1e-06, + "loss": -0.1018, + "step": 1367 + }, + { + "clip_ratio/high_max": 0.0032954135313048027, + "clip_ratio/high_mean": 0.0012228189552843105, + "clip_ratio/low_mean": 0.00129534852476354, + "clip_ratio/low_min": 1.9595547200879082e-05, + "clip_ratio/region_mean": 0.002518167566449847, + "epoch": 3.195975503062117, + "grad_norm": 0.24185238778591156, + "learning_rate": 1e-06, + "loss": -0.1019, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0022182962638908066, + "clip_ratio/high_mean": 0.0008027746625884902, + "clip_ratio/low_mean": 0.0006631616515733185, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001465936285967473, + "completions/clipped_ratio": 0.2008928571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3785.0, + "completions/mean_length": 1357.7545166015625, + "completions/mean_terminated_length": 669.3687133789062, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 3.1983085447652377, + "grad_norm": 0.30065393447875977, + "learning_rate": 1e-06, + "loss": -0.0765, + "num_tokens": 201119335.0, + "reward": 0.5546875, + "reward_std": 0.14902400970458984, + "rewards/verify_math_reward/mean": 0.5546875, + "rewards/verify_math_reward/std": 0.4972778558731079, + "step": 1369 + }, + { + "clip_ratio/high_max": 0.002445177990011871, + "clip_ratio/high_mean": 0.0009415408603672404, + "clip_ratio/low_mean": 0.0008758692583796801, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00181741012056591, + "epoch": 3.200641586468358, + "grad_norm": 0.32187435030937195, + "learning_rate": 1e-06, + "loss": -0.0765, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0026672029853216372, + "clip_ratio/high_mean": 0.0010266938543281867, + "clip_ratio/low_mean": 0.001034640639772988, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020613344968296587, + "epoch": 3.2029746281714786, + "grad_norm": 0.26315632462501526, + "learning_rate": 1e-06, + "loss": -0.0768, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.0024598382879048586, + "clip_ratio/high_mean": 0.001008868899589288, + "clip_ratio/low_mean": 0.0014022009199834429, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024110698213917203, + "epoch": 3.205307669874599, + "grad_norm": 0.23852814733982086, + "learning_rate": 1e-06, + "loss": -0.0769, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0026599104603519663, + "clip_ratio/high_mean": 0.001088276407244848, + "clip_ratio/low_mean": 0.00048437131954415236, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015726477358839475, + "completions/clipped_ratio": 0.2120535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3145.0, + "completions/mean_length": 1406.2344970703125, + "completions/mean_terminated_length": 682.3597412109375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 3.2076407115777195, + "grad_norm": 0.3115938603878021, + "learning_rate": 1e-06, + "loss": -0.0721, + "num_tokens": 201689641.0, + "reward": 0.5022321939468384, + "reward_std": 0.16660960018634796, + "rewards/verify_math_reward/mean": 0.5022321343421936, + "rewards/verify_math_reward/std": 0.5002743005752563, + "step": 1373 + }, + { + "clip_ratio/high_max": 0.002917425306804944, + "clip_ratio/high_mean": 0.0012557908485177904, + "clip_ratio/low_mean": 0.0006526048437081045, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019083957158727571, + "epoch": 3.20997375328084, + "grad_norm": 0.33406734466552734, + "learning_rate": 1e-06, + "loss": -0.0724, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.00291656961053377, + "clip_ratio/high_mean": 0.001195302145788446, + "clip_ratio/low_mean": 0.0007796578788656916, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019749599960050546, + "epoch": 3.2123067949839603, + "grad_norm": 0.26957711577415466, + "learning_rate": 1e-06, + "loss": -0.0725, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0030238851468311623, + "clip_ratio/high_mean": 0.001208049950946588, + "clip_ratio/low_mean": 0.0008903388452381478, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002098388780723326, + "epoch": 3.214639836687081, + "grad_norm": 0.25391343235969543, + "learning_rate": 1e-06, + "loss": -0.0725, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0023889686999609694, + "clip_ratio/high_mean": 0.000987817853456363, + "clip_ratio/low_mean": 0.0006864479155410663, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016742657790018711, + "completions/clipped_ratio": 0.2209821428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3866.0, + "completions/mean_length": 1485.3751220703125, + "completions/mean_terminated_length": 744.8252563476562, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "epoch": 3.216972878390201, + "grad_norm": 0.2967849373817444, + "learning_rate": 1e-06, + "loss": -0.098, + "num_tokens": 202304313.0, + "reward": 0.4765625298023224, + "reward_std": 0.1740477830171585, + "rewards/verify_math_reward/mean": 0.4765625, + "rewards/verify_math_reward/std": 0.49972933530807495, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.0024445352246402763, + "clip_ratio/high_mean": 0.0010376444915891625, + "clip_ratio/low_mean": 0.0009159694309346378, + "clip_ratio/low_min": 1.9647908629849553e-05, + "clip_ratio/region_mean": 0.001953613871592097, + "epoch": 3.219305920093322, + "grad_norm": 0.30563458800315857, + "learning_rate": 1e-06, + "loss": -0.0981, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.0023477618451579474, + "clip_ratio/high_mean": 0.0010650950243871193, + "clip_ratio/low_mean": 0.0010782867429952603, + "clip_ratio/low_min": 1.60503332153894e-05, + "clip_ratio/region_mean": 0.0021433818037621677, + "epoch": 3.221638961796442, + "grad_norm": 0.263799786567688, + "learning_rate": 1e-06, + "loss": -0.0984, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.002721808406931814, + "clip_ratio/high_mean": 0.0011329717963235453, + "clip_ratio/low_mean": 0.0011568615827854956, + "clip_ratio/low_min": 5.0730519433273e-05, + "clip_ratio/region_mean": 0.0022898333336343057, + "epoch": 3.2239720034995627, + "grad_norm": 0.23858009278774261, + "learning_rate": 1e-06, + "loss": -0.0985, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0021201352865318768, + "clip_ratio/high_mean": 0.000750652090573567, + "clip_ratio/low_mean": 0.0005253635126791778, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001276015624171123, + "completions/clipped_ratio": 0.1696428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3517.0, + "completions/mean_length": 1212.1685791015625, + "completions/mean_terminated_length": 622.9986572265625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 3.226305045202683, + "grad_norm": 0.316988080739975, + "learning_rate": 1e-06, + "loss": -0.0555, + "num_tokens": 202851776.0, + "reward": 0.621651828289032, + "reward_std": 0.1516508162021637, + "rewards/verify_math_reward/mean": 0.6216517686843872, + "rewards/verify_math_reward/std": 0.4852459728717804, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.0027541725648916326, + "clip_ratio/high_mean": 0.0009396336899953894, + "clip_ratio/low_mean": 0.0006619510313612409, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016015847795642912, + "epoch": 3.2286380869058036, + "grad_norm": 0.25705021619796753, + "learning_rate": 1e-06, + "loss": -0.0557, + "step": 1382 + }, + { + "clip_ratio/high_max": 0.002651364127814304, + "clip_ratio/high_mean": 0.000985535503787105, + "clip_ratio/low_mean": 0.0008871033442119369, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018726388880168088, + "epoch": 3.2309711286089238, + "grad_norm": 0.31974393129348755, + "learning_rate": 1e-06, + "loss": -0.0558, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.0023458923678845167, + "clip_ratio/high_mean": 0.0008305495575768873, + "clip_ratio/low_mean": 0.0010812032996909693, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019117528572678566, + "epoch": 3.2333041703120444, + "grad_norm": 0.2726139724254608, + "learning_rate": 1e-06, + "loss": -0.0558, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0024199914041673765, + "clip_ratio/high_mean": 0.0009865360079857055, + "clip_ratio/low_mean": 0.0006013246538714156, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015878607009653933, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3653.0, + "completions/mean_length": 1295.06591796875, + "completions/mean_terminated_length": 648.6964721679688, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 3.2356372120151646, + "grad_norm": 0.31969305872917175, + "learning_rate": 1e-06, + "loss": -0.0838, + "num_tokens": 203415259.0, + "reward": 0.5569196939468384, + "reward_std": 0.15097934007644653, + "rewards/verify_math_reward/mean": 0.5569196343421936, + "rewards/verify_math_reward/std": 0.49702703952789307, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0032785937219159678, + "clip_ratio/high_mean": 0.0013131868727214169, + "clip_ratio/low_mean": 0.000833667117603909, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021468539998750202, + "epoch": 3.2379702537182853, + "grad_norm": 0.25600868463516235, + "learning_rate": 1e-06, + "loss": -0.084, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.002963113114674343, + "clip_ratio/high_mean": 0.0012603523737197975, + "clip_ratio/low_mean": 0.0008865321888151811, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002146884515241254, + "epoch": 3.2403032954214055, + "grad_norm": 0.22946487367153168, + "learning_rate": 1e-06, + "loss": -0.0842, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.002728587744059041, + "clip_ratio/high_mean": 0.001143316458183108, + "clip_ratio/low_mean": 0.0010875751013372792, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002230891514045652, + "epoch": 3.242636337124526, + "grad_norm": 0.20421616733074188, + "learning_rate": 1e-06, + "loss": -0.0842, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0031228039442794397, + "clip_ratio/high_mean": 0.0011625596416706685, + "clip_ratio/low_mean": 0.000449032223514223, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016115919133881107, + "completions/clipped_ratio": 0.1361607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3356.0, + "completions/mean_length": 1118.03466796875, + "completions/mean_terminated_length": 648.6395263671875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 3.2449693788276464, + "grad_norm": 0.3249204456806183, + "learning_rate": 1e-06, + "loss": -0.0685, + "num_tokens": 204008506.0, + "reward": 0.5390625, + "reward_std": 0.18876947462558746, + "rewards/verify_math_reward/mean": 0.5390625, + "rewards/verify_math_reward/std": 0.4987502098083496, + "step": 1389 + }, + { + "clip_ratio/high_max": 0.0030883293802617118, + "clip_ratio/high_mean": 0.0012925277842441574, + "clip_ratio/low_mean": 0.0007287934840860544, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002021321233769413, + "epoch": 3.247302420530767, + "grad_norm": 0.27527910470962524, + "learning_rate": 1e-06, + "loss": -0.0688, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0035688294083229266, + "clip_ratio/high_mean": 0.0013115837173245382, + "clip_ratio/low_mean": 0.000894385588253499, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002205969278293196, + "epoch": 3.249635462233887, + "grad_norm": 0.22996072471141815, + "learning_rate": 1e-06, + "loss": -0.0689, + "step": 1391 + }, + { + "clip_ratio/high_max": 0.0032063006656244397, + "clip_ratio/high_mean": 0.0012513813744590152, + "clip_ratio/low_mean": 0.0010004925225075567, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022518738987855613, + "epoch": 3.251968503937008, + "grad_norm": 0.2599335014820099, + "learning_rate": 1e-06, + "loss": -0.069, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0017107321291405242, + "clip_ratio/high_mean": 0.0005945113161942572, + "clip_ratio/low_mean": 0.0004417756699695019, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010362869943492115, + "completions/clipped_ratio": 0.2522321428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3981.0, + "completions/mean_length": 1552.5357666015625, + "completions/mean_terminated_length": 694.591064453125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 3.2543015456401285, + "grad_norm": 0.2847426235675812, + "learning_rate": 1e-06, + "loss": -0.0712, + "num_tokens": 204568730.0, + "reward": 0.4676339626312256, + "reward_std": 0.12531113624572754, + "rewards/verify_math_reward/mean": 0.4676339328289032, + "rewards/verify_math_reward/std": 0.4992299973964691, + "step": 1393 + }, + { + "clip_ratio/high_max": 0.0022250008332775906, + "clip_ratio/high_mean": 0.0007966230305100908, + "clip_ratio/low_mean": 0.0006261913163143618, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014228143299988005, + "epoch": 3.2566345873432487, + "grad_norm": 0.25943008065223694, + "learning_rate": 1e-06, + "loss": -0.0715, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.0021694751958420966, + "clip_ratio/high_mean": 0.0007335646751016611, + "clip_ratio/low_mean": 0.0007502906291847466, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014838552997389343, + "epoch": 3.2589676290463694, + "grad_norm": 0.22387363016605377, + "learning_rate": 1e-06, + "loss": -0.0715, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.00225412361578492, + "clip_ratio/high_mean": 0.0006936196227798064, + "clip_ratio/low_mean": 0.0008708087239028828, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015644283539586468, + "epoch": 3.2613006707494896, + "grad_norm": 0.22405408322811127, + "learning_rate": 1e-06, + "loss": -0.0716, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.002606538255349733, + "clip_ratio/high_mean": 0.0008888039737939835, + "clip_ratio/low_mean": 0.0007069700986903626, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015957740324665792, + "completions/clipped_ratio": 0.1618303571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3575.0, + "completions/mean_length": 1223.63623046875, + "completions/mean_terminated_length": 669.0518798828125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 3.2636337124526102, + "grad_norm": 0.3812796473503113, + "learning_rate": 1e-06, + "loss": -0.0572, + "num_tokens": 205163652.0, + "reward": 0.515625, + "reward_std": 0.16153399646282196, + "rewards/verify_math_reward/mean": 0.515625, + "rewards/verify_math_reward/std": 0.5000349283218384, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.0026634673340595327, + "clip_ratio/high_mean": 0.001081415710359579, + "clip_ratio/low_mean": 0.0009601121455489192, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002041527899564244, + "epoch": 3.2659667541557305, + "grad_norm": 0.31780505180358887, + "learning_rate": 1e-06, + "loss": -0.0574, + "step": 1398 + }, + { + "clip_ratio/high_max": 0.0027923797897528857, + "clip_ratio/high_mean": 0.0010067439689009916, + "clip_ratio/low_mean": 0.001261196823179489, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002267940857564099, + "epoch": 3.268299795858851, + "grad_norm": 0.28611859679222107, + "learning_rate": 1e-06, + "loss": -0.0576, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.002342934902117122, + "clip_ratio/high_mean": 0.0009197864255838795, + "clip_ratio/low_mean": 0.0013291147406562231, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002248901131679304, + "epoch": 3.2706328375619713, + "grad_norm": 0.232257679104805, + "learning_rate": 1e-06, + "loss": -0.0577, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0023585003764310386, + "clip_ratio/high_mean": 0.000794408446381567, + "clip_ratio/low_mean": 0.0005302667259456939, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013246751987026073, + "completions/clipped_ratio": 0.1852678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2986.0, + "completions/mean_length": 1287.828125, + "completions/mean_terminated_length": 649.257568359375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 3.272965879265092, + "grad_norm": 0.3569932281970978, + "learning_rate": 1e-06, + "loss": -0.0867, + "num_tokens": 205724266.0, + "reward": 0.4988839626312256, + "reward_std": 0.13985875248908997, + "rewards/verify_math_reward/mean": 0.4988839328289032, + "rewards/verify_math_reward/std": 0.5002779960632324, + "step": 1401 + }, + { + "clip_ratio/high_max": 0.003075787186389789, + "clip_ratio/high_mean": 0.0009645216541684931, + "clip_ratio/low_mean": 0.0008519003768014954, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018164220455219038, + "epoch": 3.275298920968212, + "grad_norm": 0.29467490315437317, + "learning_rate": 1e-06, + "loss": -0.087, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.003196276338712778, + "clip_ratio/high_mean": 0.0009206524755427381, + "clip_ratio/low_mean": 0.0010368306466261856, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019574831567297224, + "epoch": 3.277631962671333, + "grad_norm": 0.2431877851486206, + "learning_rate": 1e-06, + "loss": -0.0872, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.0028092034262954257, + "clip_ratio/high_mean": 0.0009348743496957468, + "clip_ratio/low_mean": 0.0012063762533216504, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021412506757769734, + "epoch": 3.279965004374453, + "grad_norm": 0.22555813193321228, + "learning_rate": 1e-06, + "loss": -0.0873, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.002152534943888895, + "clip_ratio/high_mean": 0.0008175220791599713, + "clip_ratio/low_mean": 0.0006683234432784957, + "clip_ratio/low_min": 7.524680768256076e-06, + "clip_ratio/region_mean": 0.0014858455178909935, + "completions/clipped_ratio": 0.2198660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4090.0, + "completions/mean_length": 1495.5726318359375, + "completions/mean_terminated_length": 762.6909790039062, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 3.2822980460775737, + "grad_norm": 0.29808303713798523, + "learning_rate": 1e-06, + "loss": -0.1224, + "num_tokens": 206352035.0, + "reward": 0.4966517984867096, + "reward_std": 0.18396486341953278, + "rewards/verify_math_reward/mean": 0.4966517984867096, + "rewards/verify_math_reward/std": 0.5002680420875549, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.002606508096505422, + "clip_ratio/high_mean": 0.0010587483557173982, + "clip_ratio/low_mean": 0.0008699759018782061, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019287242466816679, + "epoch": 3.284631087780694, + "grad_norm": 0.2458135187625885, + "learning_rate": 1e-06, + "loss": -0.1226, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.00274065002304269, + "clip_ratio/high_mean": 0.0010915798920905218, + "clip_ratio/low_mean": 0.0010510491247259779, + "clip_ratio/low_min": 1.3536929145629983e-05, + "clip_ratio/region_mean": 0.002142629018635489, + "epoch": 3.2869641294838146, + "grad_norm": 0.2649960219860077, + "learning_rate": 1e-06, + "loss": -0.1227, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.00254956914432114, + "clip_ratio/high_mean": 0.0010358562685723882, + "clip_ratio/low_mean": 0.0011377867303963285, + "clip_ratio/low_min": 1.3536929145629983e-05, + "clip_ratio/region_mean": 0.0021736430426244624, + "epoch": 3.289297171186935, + "grad_norm": 0.22067445516586304, + "learning_rate": 1e-06, + "loss": -0.1228, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.002485864573827712, + "clip_ratio/high_mean": 0.0009715916239656508, + "clip_ratio/low_mean": 0.0006173867241159314, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015889783389866352, + "completions/clipped_ratio": 0.1975446428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3935.0, + "completions/mean_length": 1307.6663818359375, + "completions/mean_terminated_length": 621.24755859375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 3.2916302128900554, + "grad_norm": 0.4216117858886719, + "learning_rate": 1e-06, + "loss": -0.0581, + "num_tokens": 206892232.0, + "reward": 0.5803571939468384, + "reward_std": 0.14166000485420227, + "rewards/verify_math_reward/mean": 0.5803571343421936, + "rewards/verify_math_reward/std": 0.4937761127948761, + "step": 1409 + }, + { + "clip_ratio/high_max": 0.0030081197182880715, + "clip_ratio/high_mean": 0.0011857901899929857, + "clip_ratio/low_mean": 0.0008584561774114263, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020442463573999703, + "epoch": 3.2939632545931756, + "grad_norm": 0.2833123505115509, + "learning_rate": 1e-06, + "loss": -0.0584, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0028013972550979815, + "clip_ratio/high_mean": 0.001152902543253731, + "clip_ratio/low_mean": 0.0011308564498904161, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022837589713162743, + "epoch": 3.2962962962962963, + "grad_norm": 0.28059399127960205, + "learning_rate": 1e-06, + "loss": -0.0587, + "step": 1411 + }, + { + "clip_ratio/high_max": 0.0031393519675475545, + "clip_ratio/high_mean": 0.0010532768737903098, + "clip_ratio/low_mean": 0.0012799748437828384, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002333251715754159, + "epoch": 3.298629337999417, + "grad_norm": 0.3258727490901947, + "learning_rate": 1e-06, + "loss": -0.0587, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0029762344929622486, + "clip_ratio/high_mean": 0.0011315756164549384, + "clip_ratio/low_mean": 0.000596388447775098, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017279640451306477, + "completions/clipped_ratio": 0.1986607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3880.0, + "completions/mean_length": 1383.8360595703125, + "completions/mean_terminated_length": 711.4609985351562, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 3.300962379702537, + "grad_norm": 0.3155386447906494, + "learning_rate": 1e-06, + "loss": -0.0868, + "num_tokens": 207493981.0, + "reward": 0.4899553656578064, + "reward_std": 0.1817513257265091, + "rewards/verify_math_reward/mean": 0.4899553656578064, + "rewards/verify_math_reward/std": 0.5001782774925232, + "step": 1413 + }, + { + "clip_ratio/high_max": 0.003695404317113571, + "clip_ratio/high_mean": 0.0013442114504869096, + "clip_ratio/low_mean": 0.0008084241362666944, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021526355121750385, + "epoch": 3.303295421405658, + "grad_norm": 0.3200008273124695, + "learning_rate": 1e-06, + "loss": -0.087, + "step": 1414 + }, + { + "clip_ratio/high_max": 0.00346468755742535, + "clip_ratio/high_mean": 0.0013513403246179223, + "clip_ratio/low_mean": 0.000933639259528718, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022849796223454177, + "epoch": 3.305628463108778, + "grad_norm": 0.24863432347774506, + "learning_rate": 1e-06, + "loss": -0.0872, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0036600856838049367, + "clip_ratio/high_mean": 0.0013390149251790717, + "clip_ratio/low_mean": 0.0011202372443221975, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0024592521294835024, + "epoch": 3.3079615048118987, + "grad_norm": 0.23993004858493805, + "learning_rate": 1e-06, + "loss": -0.0872, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0019119550051982515, + "clip_ratio/high_mean": 0.0006607695031561889, + "clip_ratio/low_mean": 0.0004249778171470098, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00108574733894784, + "completions/clipped_ratio": 0.1417410714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3274.0, + "completions/mean_length": 1082.376220703125, + "completions/mean_terminated_length": 584.677490234375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 3.310294546515019, + "grad_norm": 0.30247485637664795, + "learning_rate": 1e-06, + "loss": -0.0402, + "num_tokens": 208020966.0, + "reward": 0.6361607313156128, + "reward_std": 0.12267616391181946, + "rewards/verify_math_reward/mean": 0.6361607313156128, + "rewards/verify_math_reward/std": 0.4813718795776367, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.002196392902988009, + "clip_ratio/high_mean": 0.0008674347182022757, + "clip_ratio/low_mean": 0.0006404634668797371, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00150789816325414, + "epoch": 3.3126275882181395, + "grad_norm": 0.27442285418510437, + "learning_rate": 1e-06, + "loss": -0.0405, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.0025767409315449186, + "clip_ratio/high_mean": 0.0009292753056797665, + "clip_ratio/low_mean": 0.0008339601104125904, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017632354065426625, + "epoch": 3.3149606299212597, + "grad_norm": 0.2677547037601471, + "learning_rate": 1e-06, + "loss": -0.0406, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.0021459552117448766, + "clip_ratio/high_mean": 0.0007863909440857242, + "clip_ratio/low_mean": 0.0009221609288943, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017085518957173917, + "epoch": 3.3172936716243804, + "grad_norm": 0.28091907501220703, + "learning_rate": 1e-06, + "loss": -0.0406, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0026617609764798544, + "clip_ratio/high_mean": 0.001080154237570241, + "clip_ratio/low_mean": 0.0006827125680501922, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017628667847020552, + "completions/clipped_ratio": 0.1573660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2586.0, + "completions/mean_length": 1220.9085693359375, + "completions/mean_terminated_length": 683.9708862304688, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 3.3196267133275006, + "grad_norm": 0.31796377897262573, + "learning_rate": 1e-06, + "loss": -0.0813, + "num_tokens": 208628124.0, + "reward": 0.5859375, + "reward_std": 0.1989564299583435, + "rewards/verify_math_reward/mean": 0.5859375, + "rewards/verify_math_reward/std": 0.4928344786167145, + "step": 1421 + }, + { + "clip_ratio/high_max": 0.003174099554598797, + "clip_ratio/high_mean": 0.001284394380490994, + "clip_ratio/low_mean": 0.000976650461780082, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022610448650084436, + "epoch": 3.3219597550306212, + "grad_norm": 0.2940112054347992, + "learning_rate": 1e-06, + "loss": -0.0816, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.003210231261618901, + "clip_ratio/high_mean": 0.0013108856364851817, + "clip_ratio/low_mean": 0.001243566661287332, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002554452257754747, + "epoch": 3.3242927967337415, + "grad_norm": 0.36122965812683105, + "learning_rate": 1e-06, + "loss": -0.0819, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.0031171913651633076, + "clip_ratio/high_mean": 0.00118163075967459, + "clip_ratio/low_mean": 0.0014060313278605463, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002587662158475723, + "epoch": 3.326625838436862, + "grad_norm": 0.2434595227241516, + "learning_rate": 1e-06, + "loss": -0.0819, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0022868446685606614, + "clip_ratio/high_mean": 0.0009595526171324309, + "clip_ratio/low_mean": 0.0006868742293590913, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016464268555864692, + "completions/clipped_ratio": 0.2287946428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3040.0, + "completions/mean_length": 1486.864990234375, + "completions/mean_terminated_length": 712.8089599609375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 3.3289588801399823, + "grad_norm": 0.33154115080833435, + "learning_rate": 1e-06, + "loss": -0.1223, + "num_tokens": 209217779.0, + "reward": 0.5345982313156128, + "reward_std": 0.17682796716690063, + "rewards/verify_math_reward/mean": 0.5345982313156128, + "rewards/verify_math_reward/std": 0.4990801215171814, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0027643133798846975, + "clip_ratio/high_mean": 0.0012076245911885053, + "clip_ratio/low_mean": 0.0008671995437907754, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020748240931425244, + "epoch": 3.331291921843103, + "grad_norm": 0.2755907475948334, + "learning_rate": 1e-06, + "loss": -0.1226, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.0028445334028219804, + "clip_ratio/high_mean": 0.001203192128741648, + "clip_ratio/low_mean": 0.000988325117759814, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021915172765147872, + "epoch": 3.3336249635462236, + "grad_norm": 0.2327805757522583, + "learning_rate": 1e-06, + "loss": -0.1227, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.0022473930803244, + "clip_ratio/high_mean": 0.0010317661872250028, + "clip_ratio/low_mean": 0.0011683286265906645, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002200094830186572, + "epoch": 3.335958005249344, + "grad_norm": 0.3731497824192047, + "learning_rate": 1e-06, + "loss": -0.1228, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.002318192884558812, + "clip_ratio/high_mean": 0.0009074763056560187, + "clip_ratio/low_mean": 0.0005459625608636998, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014534388537867926, + "completions/clipped_ratio": 0.1417410714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3460.0, + "completions/mean_length": 1155.704345703125, + "completions/mean_terminated_length": 670.11572265625, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 3.338291046952464, + "grad_norm": 0.29542016983032227, + "learning_rate": 1e-06, + "loss": -0.0759, + "num_tokens": 209816866.0, + "reward": 0.6116071939468384, + "reward_std": 0.1679650843143463, + "rewards/verify_math_reward/mean": 0.6116071343421936, + "rewards/verify_math_reward/std": 0.48765692114830017, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.0026791319833137095, + "clip_ratio/high_mean": 0.0010337810726923635, + "clip_ratio/low_mean": 0.0007834240414013038, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018172051168221515, + "epoch": 3.3406240886555847, + "grad_norm": 0.3144708573818207, + "learning_rate": 1e-06, + "loss": -0.0762, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0030275606841314584, + "clip_ratio/high_mean": 0.0010740918824012624, + "clip_ratio/low_mean": 0.00085768296776223, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019317748447065242, + "epoch": 3.3429571303587053, + "grad_norm": 0.22165705263614655, + "learning_rate": 1e-06, + "loss": -0.0764, + "step": 1431 + }, + { + "clip_ratio/high_max": 0.0026318026721128263, + "clip_ratio/high_mean": 0.0010720016362029128, + "clip_ratio/low_mean": 0.0010904667506110854, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002162468394089956, + "epoch": 3.3452901720618256, + "grad_norm": 0.23472611606121063, + "learning_rate": 1e-06, + "loss": -0.0764, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0021777022557216696, + "clip_ratio/high_mean": 0.0008209644402086269, + "clip_ratio/low_mean": 0.00048828055128069536, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013092449844407383, + "completions/clipped_ratio": 0.1841517857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3955.0, + "completions/mean_length": 1342.7098388671875, + "completions/mean_terminated_length": 721.2421875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 3.347623213764946, + "grad_norm": 0.3252272307872772, + "learning_rate": 1e-06, + "loss": -0.0597, + "num_tokens": 210438814.0, + "reward": 0.559151828289032, + "reward_std": 0.1498156636953354, + "rewards/verify_math_reward/mean": 0.5591517686843872, + "rewards/verify_math_reward/std": 0.496766060590744, + "step": 1433 + }, + { + "clip_ratio/high_max": 0.0021948493376839906, + "clip_ratio/high_mean": 0.0007772110275254818, + "clip_ratio/low_mean": 0.0007795270157657797, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015567380032734945, + "epoch": 3.3499562554680664, + "grad_norm": 0.2555447816848755, + "learning_rate": 1e-06, + "loss": -0.0599, + "step": 1434 + }, + { + "clip_ratio/high_max": 0.0026419895366416313, + "clip_ratio/high_mean": 0.0009608177933841944, + "clip_ratio/low_mean": 0.0008887026015145238, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018495203839847818, + "epoch": 3.352289297171187, + "grad_norm": 0.2451346218585968, + "learning_rate": 1e-06, + "loss": -0.0601, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0022972883889451623, + "clip_ratio/high_mean": 0.000838000169096631, + "clip_ratio/low_mean": 0.0010169958750338992, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018549960732343607, + "epoch": 3.3546223388743073, + "grad_norm": 0.19539831578731537, + "learning_rate": 1e-06, + "loss": -0.0601, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0023833828745409846, + "clip_ratio/high_mean": 0.0009572358248988166, + "clip_ratio/low_mean": 0.000545118511581677, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015023543310235254, + "completions/clipped_ratio": 0.1573660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3901.0, + "completions/mean_length": 1180.9810791015625, + "completions/mean_terminated_length": 636.5867919921875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 3.356955380577428, + "grad_norm": 0.2828822731971741, + "learning_rate": 1e-06, + "loss": -0.0351, + "num_tokens": 211010349.0, + "reward": 0.5691964626312256, + "reward_std": 0.15639057755470276, + "rewards/verify_math_reward/mean": 0.5691964030265808, + "rewards/verify_math_reward/std": 0.4954652488231659, + "step": 1437 + }, + { + "clip_ratio/high_max": 0.002974518807604909, + "clip_ratio/high_mean": 0.0011843534666695632, + "clip_ratio/low_mean": 0.0007223030552268028, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019066565146204084, + "epoch": 3.359288422280548, + "grad_norm": 0.2754281759262085, + "learning_rate": 1e-06, + "loss": -0.0353, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.0027547199715627357, + "clip_ratio/high_mean": 0.0010415829019621015, + "clip_ratio/low_mean": 0.0009017273655445024, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001943310235219542, + "epoch": 3.361621463983669, + "grad_norm": 0.24657297134399414, + "learning_rate": 1e-06, + "loss": -0.0355, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.0024943129246821627, + "clip_ratio/high_mean": 0.0010055546554212924, + "clip_ratio/low_mean": 0.0010134600152014173, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020190146860841196, + "epoch": 3.363954505686789, + "grad_norm": 0.24397654831409454, + "learning_rate": 1e-06, + "loss": -0.0355, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.002338218291697558, + "clip_ratio/high_mean": 0.0008907749215723015, + "clip_ratio/low_mean": 0.0007159109391068341, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016066858697740827, + "completions/clipped_ratio": 0.1852678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2982.0, + "completions/mean_length": 1248.5023193359375, + "completions/mean_terminated_length": 600.9890747070312, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 3.3662875473899097, + "grad_norm": 0.5020262598991394, + "learning_rate": 1e-06, + "loss": -0.0937, + "num_tokens": 211549791.0, + "reward": 0.5837053656578064, + "reward_std": 0.16735847294330597, + "rewards/verify_math_reward/mean": 0.5837053656578064, + "rewards/verify_math_reward/std": 0.49321892857551575, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.003099871944868937, + "clip_ratio/high_mean": 0.0011718298719642917, + "clip_ratio/low_mean": 0.0009420182432222646, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002113848146109376, + "epoch": 3.36862058909303, + "grad_norm": 0.2910059690475464, + "learning_rate": 1e-06, + "loss": -0.0941, + "step": 1442 + }, + { + "clip_ratio/high_max": 0.0036351317903609015, + "clip_ratio/high_mean": 0.0012362859579297947, + "clip_ratio/low_mean": 0.0012269529543118551, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002463238903146703, + "epoch": 3.3709536307961505, + "grad_norm": 0.25305771827697754, + "learning_rate": 1e-06, + "loss": -0.0944, + "step": 1443 + }, + { + "clip_ratio/high_max": 0.00290341027721297, + "clip_ratio/high_mean": 0.0010353881625633221, + "clip_ratio/low_mean": 0.001412170080584474, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002447558203130029, + "epoch": 3.3732866724992707, + "grad_norm": 0.2992576062679291, + "learning_rate": 1e-06, + "loss": -0.0943, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0026444131799507886, + "clip_ratio/high_mean": 0.0012394175937515683, + "clip_ratio/low_mean": 0.0006469405016105156, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018863580698962323, + "completions/clipped_ratio": 0.1607142857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2743.0, + "completions/mean_length": 1167.6273193359375, + "completions/mean_terminated_length": 606.875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 3.3756197142023914, + "grad_norm": 0.3741614520549774, + "learning_rate": 1e-06, + "loss": -0.0885, + "num_tokens": 212092313.0, + "reward": 0.6037946939468384, + "reward_std": 0.19392429292201996, + "rewards/verify_math_reward/mean": 0.6037946343421936, + "rewards/verify_math_reward/std": 0.48938122391700745, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0033751697483239695, + "clip_ratio/high_mean": 0.0014848263417661656, + "clip_ratio/low_mean": 0.0009012938253363245, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00238612021348672, + "epoch": 3.377952755905512, + "grad_norm": 0.4091361463069916, + "learning_rate": 1e-06, + "loss": -0.0888, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.002985111394082196, + "clip_ratio/high_mean": 0.0014381614200829063, + "clip_ratio/low_mean": 0.0011305916850687936, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0025687531087896787, + "epoch": 3.3802857976086322, + "grad_norm": 0.27056244015693665, + "learning_rate": 1e-06, + "loss": -0.089, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.0030088677958701737, + "clip_ratio/high_mean": 0.001426322076440556, + "clip_ratio/low_mean": 0.001348124345895485, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027744463659473695, + "epoch": 3.382618839311753, + "grad_norm": 0.4235474169254303, + "learning_rate": 1e-06, + "loss": -0.0891, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.002184545337513555, + "clip_ratio/high_mean": 0.0008029766613617539, + "clip_ratio/low_mean": 0.0007079693550622324, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015109460146049969, + "completions/clipped_ratio": 0.1741071428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2959.0, + "completions/mean_length": 1263.3070068359375, + "completions/mean_terminated_length": 666.1445922851562, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 3.384951881014873, + "grad_norm": 0.3509569764137268, + "learning_rate": 1e-06, + "loss": -0.0516, + "num_tokens": 212675356.0, + "reward": 0.5022321939468384, + "reward_std": 0.14789676666259766, + "rewards/verify_math_reward/mean": 0.5022321343421936, + "rewards/verify_math_reward/std": 0.5002743005752563, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.0027263353185844608, + "clip_ratio/high_mean": 0.0009901039575197501, + "clip_ratio/low_mean": 0.0009751847173902206, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019652886549010873, + "epoch": 3.3872849227179938, + "grad_norm": 0.2886752188205719, + "learning_rate": 1e-06, + "loss": -0.0518, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0028788219424313866, + "clip_ratio/high_mean": 0.0010585665240796516, + "clip_ratio/low_mean": 0.0011499497886688914, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022085162927396595, + "epoch": 3.389617964421114, + "grad_norm": 0.2655644714832306, + "learning_rate": 1e-06, + "loss": -0.0521, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.0024294789545820095, + "clip_ratio/high_mean": 0.0009403186340932734, + "clip_ratio/low_mean": 0.0013969198989798315, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002337238511245232, + "epoch": 3.3919510061242346, + "grad_norm": 0.41138431429862976, + "learning_rate": 1e-06, + "loss": -0.0521, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.003638951056927908, + "clip_ratio/high_mean": 0.0010821499108715216, + "clip_ratio/low_mean": 0.0006731365037921933, + "clip_ratio/low_min": 3.533069684635848e-05, + "clip_ratio/region_mean": 0.0017552864010212943, + "completions/clipped_ratio": 0.2053571428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4087.0, + "completions/mean_length": 1338.818115234375, + "completions/mean_terminated_length": 626.2879028320312, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 3.394284047827355, + "grad_norm": 0.5266408324241638, + "learning_rate": 1e-06, + "loss": -0.0852, + "num_tokens": 213219121.0, + "reward": 0.5301339626312256, + "reward_std": 0.1286219209432602, + "rewards/verify_math_reward/mean": 0.5301339030265808, + "rewards/verify_math_reward/std": 0.49936985969543457, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.0034111940694856457, + "clip_ratio/high_mean": 0.0011253326847509015, + "clip_ratio/low_mean": 0.0011274409289399046, + "clip_ratio/low_min": 3.533069684635848e-05, + "clip_ratio/region_mean": 0.002252773614600301, + "epoch": 3.3966170895304755, + "grad_norm": 0.3869584798812866, + "learning_rate": 1e-06, + "loss": -0.0854, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.003642870804469567, + "clip_ratio/high_mean": 0.001153471666839323, + "clip_ratio/low_mean": 0.0013703641252504895, + "clip_ratio/low_min": 3.533069684635848e-05, + "clip_ratio/region_mean": 0.0025238358066417277, + "epoch": 3.3989501312335957, + "grad_norm": 0.25054648518562317, + "learning_rate": 1e-06, + "loss": -0.0856, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.003084875839704182, + "clip_ratio/high_mean": 0.0010314236951671774, + "clip_ratio/low_mean": 0.0015251716185957775, + "clip_ratio/low_min": 5.299604163155891e-05, + "clip_ratio/region_mean": 0.002556595398345962, + "epoch": 3.4012831729367163, + "grad_norm": 0.29132816195487976, + "learning_rate": 1e-06, + "loss": -0.0857, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0021597528975689784, + "clip_ratio/high_mean": 0.0007703071896685287, + "clip_ratio/low_mean": 0.0005968487203062978, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013671559136128053, + "completions/clipped_ratio": 0.1595982142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3225.0, + "completions/mean_length": 1205.3270263671875, + "completions/mean_terminated_length": 656.3678588867188, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 3.4036162146398365, + "grad_norm": 0.3055138885974884, + "learning_rate": 1e-06, + "loss": -0.0559, + "num_tokens": 213798326.0, + "reward": 0.5703125, + "reward_std": 0.14917626976966858, + "rewards/verify_math_reward/mean": 0.5703125, + "rewards/verify_math_reward/std": 0.49530795216560364, + "step": 1457 + }, + { + "clip_ratio/high_max": 0.0023967510569491424, + "clip_ratio/high_mean": 0.000890681307282648, + "clip_ratio/low_mean": 0.000815241770396824, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017059230849554297, + "epoch": 3.405949256342957, + "grad_norm": 0.31193846464157104, + "learning_rate": 1e-06, + "loss": -0.056, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.0029225418911664747, + "clip_ratio/high_mean": 0.0009370192456117366, + "clip_ratio/low_mean": 0.0010417152334412094, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001978734500880819, + "epoch": 3.4082822980460774, + "grad_norm": 0.2549133002758026, + "learning_rate": 1e-06, + "loss": -0.0562, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.002612090807815548, + "clip_ratio/high_mean": 0.0008295100005852873, + "clip_ratio/low_mean": 0.001182372754556127, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020118827233090997, + "epoch": 3.410615339749198, + "grad_norm": 0.22129330039024353, + "learning_rate": 1e-06, + "loss": -0.0562, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0024314190086442977, + "clip_ratio/high_mean": 0.0008742830959818093, + "clip_ratio/low_mean": 0.0006333228957373649, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015076059935381636, + "completions/clipped_ratio": 0.1819196428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3286.0, + "completions/mean_length": 1304.188720703125, + "completions/mean_terminated_length": 683.3629150390625, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 3.4129483814523183, + "grad_norm": 0.33513981103897095, + "learning_rate": 1e-06, + "loss": -0.0537, + "num_tokens": 214401231.0, + "reward": 0.5267857313156128, + "reward_std": 0.15262722969055176, + "rewards/verify_math_reward/mean": 0.5267857313156128, + "rewards/verify_math_reward/std": 0.4995608627796173, + "step": 1461 + }, + { + "clip_ratio/high_max": 0.0030209238320821896, + "clip_ratio/high_mean": 0.0010743962702690624, + "clip_ratio/low_mean": 0.000757148836783017, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018315451088710688, + "epoch": 3.415281423155439, + "grad_norm": 0.32869017124176025, + "learning_rate": 1e-06, + "loss": -0.0539, + "step": 1462 + }, + { + "clip_ratio/high_max": 0.0027078243874711916, + "clip_ratio/high_mean": 0.0010203564634139184, + "clip_ratio/low_mean": 0.0009456528478040127, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001966009229363408, + "epoch": 3.417614464858559, + "grad_norm": 0.31274259090423584, + "learning_rate": 1e-06, + "loss": -0.054, + "step": 1463 + }, + { + "clip_ratio/high_max": 0.0029160712656448595, + "clip_ratio/high_mean": 0.0009818917023949325, + "clip_ratio/low_mean": 0.001110161920223618, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002092053597152699, + "epoch": 3.41994750656168, + "grad_norm": 0.3171895146369934, + "learning_rate": 1e-06, + "loss": -0.0542, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0025632300530560315, + "clip_ratio/high_mean": 0.0010922487563220784, + "clip_ratio/low_mean": 0.0004209585167700425, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015132072949199937, + "completions/clipped_ratio": 0.2287946428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3044.0, + "completions/mean_length": 1456.368408203125, + "completions/mean_terminated_length": 673.2648315429688, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 3.4222805482648004, + "grad_norm": 0.2822405993938446, + "learning_rate": 1e-06, + "loss": -0.1051, + "num_tokens": 214964665.0, + "reward": 0.5189732313156128, + "reward_std": 0.16979841887950897, + "rewards/verify_math_reward/mean": 0.5189732313156128, + "rewards/verify_math_reward/std": 0.49991893768310547, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0032525687711313367, + "clip_ratio/high_mean": 0.00120888634774019, + "clip_ratio/low_mean": 0.0005372365794755751, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001746122958138585, + "epoch": 3.4246135899679206, + "grad_norm": 0.26969727873802185, + "learning_rate": 1e-06, + "loss": -0.1052, + "step": 1466 + }, + { + "clip_ratio/high_max": 0.0027927601477131248, + "clip_ratio/high_mean": 0.0012141498791606864, + "clip_ratio/low_mean": 0.0008037213856368908, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020178713093628176, + "epoch": 3.4269466316710413, + "grad_norm": 0.22524547576904297, + "learning_rate": 1e-06, + "loss": -0.1054, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.0033165725326398388, + "clip_ratio/high_mean": 0.0013566705092671327, + "clip_ratio/low_mean": 0.0008546340759494342, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022113045706646517, + "epoch": 3.4292796733741615, + "grad_norm": 0.2359517365694046, + "learning_rate": 1e-06, + "loss": -0.1055, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.002432909059280064, + "clip_ratio/high_mean": 0.0009736033844092162, + "clip_ratio/low_mean": 0.0008165398685378022, + "clip_ratio/low_min": 3.7224537663860247e-05, + "clip_ratio/region_mean": 0.0017901432293001562, + "completions/clipped_ratio": 0.1339285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2316.0, + "completions/mean_length": 1084.165283203125, + "completions/mean_terminated_length": 618.41748046875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 3.431612715077282, + "grad_norm": 0.3691757023334503, + "learning_rate": 1e-06, + "loss": -0.0603, + "num_tokens": 215528637.0, + "reward": 0.5970982313156128, + "reward_std": 0.1787768304347992, + "rewards/verify_math_reward/mean": 0.5970982313156128, + "rewards/verify_math_reward/std": 0.49075523018836975, + "step": 1469 + }, + { + "clip_ratio/high_max": 0.003240038473450113, + "clip_ratio/high_mean": 0.001244753459104686, + "clip_ratio/low_mean": 0.0010759066644823179, + "clip_ratio/low_min": 2.810251862683799e-05, + "clip_ratio/region_mean": 0.002320660096302163, + "epoch": 3.4339457567804024, + "grad_norm": 0.2718658745288849, + "learning_rate": 1e-06, + "loss": -0.0606, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.003535356947395485, + "clip_ratio/high_mean": 0.0012196841926197521, + "clip_ratio/low_mean": 0.0012764189232257195, + "clip_ratio/low_min": 6.488450890174136e-05, + "clip_ratio/region_mean": 0.0024961030794656835, + "epoch": 3.436278798483523, + "grad_norm": 0.2634088099002838, + "learning_rate": 1e-06, + "loss": -0.0608, + "step": 1471 + }, + { + "clip_ratio/high_max": 0.003507355591864325, + "clip_ratio/high_mean": 0.001267346080567222, + "clip_ratio/low_mean": 0.0015165622462518513, + "clip_ratio/low_min": 9.048136416822672e-05, + "clip_ratio/region_mean": 0.002783908297715243, + "epoch": 3.4386118401866432, + "grad_norm": 0.24711427092552185, + "learning_rate": 1e-06, + "loss": -0.061, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0026092431653523818, + "clip_ratio/high_mean": 0.0009640817843319383, + "clip_ratio/low_mean": 0.0006873779857414775, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016514597373316064, + "completions/clipped_ratio": 0.1729910714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3145.0, + "completions/mean_length": 1235.1942138671875, + "completions/mean_terminated_length": 636.780029296875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 3.440944881889764, + "grad_norm": 0.33380481600761414, + "learning_rate": 1e-06, + "loss": -0.0653, + "num_tokens": 216100843.0, + "reward": 0.5345982313156128, + "reward_std": 0.1698751002550125, + "rewards/verify_math_reward/mean": 0.5345982313156128, + "rewards/verify_math_reward/std": 0.4990801215171814, + "step": 1473 + }, + { + "clip_ratio/high_max": 0.0029111165495123714, + "clip_ratio/high_mean": 0.001066847351467004, + "clip_ratio/low_mean": 0.0009605682098481338, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020274155322113074, + "epoch": 3.443277923592884, + "grad_norm": 0.3211491107940674, + "learning_rate": 1e-06, + "loss": -0.0656, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.0030401530166273005, + "clip_ratio/high_mean": 0.0011105716566817136, + "clip_ratio/low_mean": 0.001179943916213233, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002290515556524042, + "epoch": 3.4456109652960047, + "grad_norm": 0.2712908983230591, + "learning_rate": 1e-06, + "loss": -0.0657, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0028097581598558463, + "clip_ratio/high_mean": 0.0010535451365285553, + "clip_ratio/low_mean": 0.0012918122920382302, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002345357359445188, + "epoch": 3.447944006999125, + "grad_norm": 0.35899391770362854, + "learning_rate": 1e-06, + "loss": -0.0658, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0027615407088887878, + "clip_ratio/high_mean": 0.0009777019477041904, + "clip_ratio/low_mean": 0.0004897192902717507, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014674212216050364, + "completions/clipped_ratio": 0.1953125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3851.0, + "completions/mean_length": 1360.4732666015625, + "completions/mean_terminated_length": 696.5104370117188, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 3.4502770487022456, + "grad_norm": 0.30021557211875916, + "learning_rate": 1e-06, + "loss": -0.0882, + "num_tokens": 216688891.0, + "reward": 0.5401785969734192, + "reward_std": 0.13301397860050201, + "rewards/verify_math_reward/mean": 0.5401785969734192, + "rewards/verify_math_reward/std": 0.49866142868995667, + "step": 1477 + }, + { + "clip_ratio/high_max": 0.003418921187403612, + "clip_ratio/high_mean": 0.0011423223222664092, + "clip_ratio/low_mean": 0.0006081819453811477, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001750504299707245, + "epoch": 3.452610090405366, + "grad_norm": 0.2633775472640991, + "learning_rate": 1e-06, + "loss": -0.0884, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.002891820127842948, + "clip_ratio/high_mean": 0.0011583926898310892, + "clip_ratio/low_mean": 0.0007072868293107604, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001865679514594376, + "epoch": 3.4549431321084865, + "grad_norm": 0.25078412890434265, + "learning_rate": 1e-06, + "loss": -0.0885, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.003491372503049206, + "clip_ratio/high_mean": 0.0011569949419936165, + "clip_ratio/low_mean": 0.0009172863428830169, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020742812994285487, + "epoch": 3.457276173811607, + "grad_norm": 0.21249748766422272, + "learning_rate": 1e-06, + "loss": -0.0886, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.002392310043433099, + "clip_ratio/high_mean": 0.0007180312068157946, + "clip_ratio/low_mean": 0.0007742367833998287, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014922680238669273, + "completions/clipped_ratio": 0.1462053571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3920.0, + "completions/mean_length": 1145.55810546875, + "completions/mean_terminated_length": 640.3189697265625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 3.4596092155147273, + "grad_norm": 0.35187554359436035, + "learning_rate": 1e-06, + "loss": -0.0425, + "num_tokens": 217263943.0, + "reward": 0.6183035969734192, + "reward_std": 0.1450003683567047, + "rewards/verify_math_reward/mean": 0.6183035969734192, + "rewards/verify_math_reward/std": 0.4860740303993225, + "step": 1481 + }, + { + "clip_ratio/high_max": 0.0031581436760461656, + "clip_ratio/high_mean": 0.0009571097762091085, + "clip_ratio/low_mean": 0.0010297510352756944, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019868608069373295, + "epoch": 3.4619422572178475, + "grad_norm": 0.27443844079971313, + "learning_rate": 1e-06, + "loss": -0.0428, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.002752281063294504, + "clip_ratio/high_mean": 0.0008675640815454244, + "clip_ratio/low_mean": 0.0012707108498943853, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021382749073382, + "epoch": 3.464275298920968, + "grad_norm": 0.32418301701545715, + "learning_rate": 1e-06, + "loss": -0.0429, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.0028385285258991644, + "clip_ratio/high_mean": 0.00083394997909636, + "clip_ratio/low_mean": 0.0014543270044669043, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002288276948092971, + "epoch": 3.466608340624089, + "grad_norm": 0.2492295652627945, + "learning_rate": 1e-06, + "loss": -0.043, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.002522495015000459, + "clip_ratio/high_mean": 0.0011079816522396868, + "clip_ratio/low_mean": 0.0007437406347889919, + "clip_ratio/low_min": 3.380270391062368e-05, + "clip_ratio/region_mean": 0.0018517222561058588, + "completions/clipped_ratio": 0.1540178571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2589.0, + "completions/mean_length": 1192.966552734375, + "completions/mean_terminated_length": 664.4459228515625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 3.468941382327209, + "grad_norm": 0.3722512423992157, + "learning_rate": 1e-06, + "loss": -0.0637, + "num_tokens": 217863585.0, + "reward": 0.5680803656578064, + "reward_std": 0.19704709947109222, + "rewards/verify_math_reward/mean": 0.5680803656578064, + "rewards/verify_math_reward/std": 0.4956200420856476, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0029757852462353185, + "clip_ratio/high_mean": 0.0013127740094205365, + "clip_ratio/low_mean": 0.001094865772756748, + "clip_ratio/low_min": 0.00011356317554600537, + "clip_ratio/region_mean": 0.0024076397821772844, + "epoch": 3.4712744240303297, + "grad_norm": 0.31583505868911743, + "learning_rate": 1e-06, + "loss": -0.064, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.0030220092594390735, + "clip_ratio/high_mean": 0.0013473187464114744, + "clip_ratio/low_mean": 0.0012209015203552553, + "clip_ratio/low_min": 0.0001388404725730652, + "clip_ratio/region_mean": 0.00256822032679338, + "epoch": 3.47360746573345, + "grad_norm": 0.3032079339027405, + "learning_rate": 1e-06, + "loss": -0.0642, + "step": 1487 + }, + { + "clip_ratio/high_max": 0.002851707373338286, + "clip_ratio/high_mean": 0.0012149710855737794, + "clip_ratio/low_mean": 0.0015116343056433834, + "clip_ratio/low_min": 0.00014014556109032128, + "clip_ratio/region_mean": 0.0027266053948551416, + "epoch": 3.4759405074365706, + "grad_norm": 0.2756774127483368, + "learning_rate": 1e-06, + "loss": -0.0643, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0022743008157704026, + "clip_ratio/high_mean": 0.0008011747177079087, + "clip_ratio/low_mean": 0.000605744857693935, + "clip_ratio/low_min": 1.3760458386968821e-05, + "clip_ratio/region_mean": 0.0014069195785850752, + "completions/clipped_ratio": 0.2064732142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3992.0, + "completions/mean_length": 1348.6273193359375, + "completions/mean_terminated_length": 633.7693481445312, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 3.478273549139691, + "grad_norm": 0.40627923607826233, + "learning_rate": 1e-06, + "loss": -0.0719, + "num_tokens": 218403563.0, + "reward": 0.5479910969734192, + "reward_std": 0.14673490822315216, + "rewards/verify_math_reward/mean": 0.5479910969734192, + "rewards/verify_math_reward/std": 0.49796950817108154, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.003021533993887715, + "clip_ratio/high_mean": 0.0011017732085747411, + "clip_ratio/low_mean": 0.0009259931696306012, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002027766386163421, + "epoch": 3.4806065908428114, + "grad_norm": 0.3433954119682312, + "learning_rate": 1e-06, + "loss": -0.0722, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.002794168824038934, + "clip_ratio/high_mean": 0.0009876603235170478, + "clip_ratio/low_mean": 0.0010355198619436123, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002023180182732176, + "epoch": 3.4829396325459316, + "grad_norm": 0.3085142970085144, + "learning_rate": 1e-06, + "loss": -0.0724, + "step": 1491 + }, + { + "clip_ratio/high_max": 0.0029426438704831526, + "clip_ratio/high_mean": 0.000971582294369, + "clip_ratio/low_mean": 0.0011516471972754516, + "clip_ratio/low_min": 2.1566596842603758e-05, + "clip_ratio/region_mean": 0.0021232294602668844, + "epoch": 3.4852726742490523, + "grad_norm": 0.25115662813186646, + "learning_rate": 1e-06, + "loss": -0.0724, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0029007212397118565, + "clip_ratio/high_mean": 0.0010477241830813, + "clip_ratio/low_mean": 0.0006721846602886217, + "clip_ratio/low_min": 1.7053205738193356e-05, + "clip_ratio/region_mean": 0.0017199088288180064, + "completions/clipped_ratio": 0.1808035714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3476.0, + "completions/mean_length": 1296.5703125, + "completions/mean_terminated_length": 678.7125244140625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 3.4876057159521725, + "grad_norm": 0.3308228552341461, + "learning_rate": 1e-06, + "loss": -0.083, + "num_tokens": 218999322.0, + "reward": 0.5234375, + "reward_std": 0.17002595961093903, + "rewards/verify_math_reward/mean": 0.5234375, + "rewards/verify_math_reward/std": 0.49972933530807495, + "step": 1493 + }, + { + "clip_ratio/high_max": 0.0030103138415142894, + "clip_ratio/high_mean": 0.0012242803904882749, + "clip_ratio/low_mean": 0.0009718649816932157, + "clip_ratio/low_min": 1.2447719200281426e-05, + "clip_ratio/region_mean": 0.0021961453821859322, + "epoch": 3.489938757655293, + "grad_norm": 0.3048311471939087, + "learning_rate": 1e-06, + "loss": -0.0832, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.0033319630529149435, + "clip_ratio/high_mean": 0.0012709472885035211, + "clip_ratio/low_mean": 0.001170280753285624, + "clip_ratio/low_min": 2.4895438400562853e-05, + "clip_ratio/region_mean": 0.0024412280108663253, + "epoch": 3.4922717993584134, + "grad_norm": 0.26479119062423706, + "learning_rate": 1e-06, + "loss": -0.0834, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0031683956112829037, + "clip_ratio/high_mean": 0.0011402380714571336, + "clip_ratio/low_mean": 0.001393181777530117, + "clip_ratio/low_min": 3.734315760084428e-05, + "clip_ratio/region_mean": 0.0025334198217024095, + "epoch": 3.494604841061534, + "grad_norm": 0.25681331753730774, + "learning_rate": 1e-06, + "loss": -0.0835, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0023501041723648086, + "clip_ratio/high_mean": 0.000847990909278451, + "clip_ratio/low_mean": 0.00043196730780437065, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00127995821821969, + "completions/clipped_ratio": 0.1986607142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3779.0, + "completions/mean_length": 1313.32373046875, + "completions/mean_terminated_length": 623.4679565429688, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 3.4969378827646542, + "grad_norm": 0.23563139140605927, + "learning_rate": 1e-06, + "loss": -0.0622, + "num_tokens": 219524948.0, + "reward": 0.5345982313156128, + "reward_std": 0.12831631302833557, + "rewards/verify_math_reward/mean": 0.5345982313156128, + "rewards/verify_math_reward/std": 0.4990801215171814, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.0026061613607453182, + "clip_ratio/high_mean": 0.0009985405467887176, + "clip_ratio/low_mean": 0.0006235589321477164, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016220994584728032, + "epoch": 3.499270924467775, + "grad_norm": 0.23240315914154053, + "learning_rate": 1e-06, + "loss": -0.0623, + "step": 1498 + }, + { + "clip_ratio/high_max": 0.002646317963808542, + "clip_ratio/high_mean": 0.0009712394812595448, + "clip_ratio/low_mean": 0.0006868788418614713, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016581182972004171, + "epoch": 3.5016039661708955, + "grad_norm": 0.2139987349510193, + "learning_rate": 1e-06, + "loss": -0.0624, + "step": 1499 + }, + { + "clip_ratio/high_max": 0.0025528997921355767, + "clip_ratio/high_mean": 0.0009007496255435399, + "clip_ratio/low_mean": 0.0009149990187324875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018157487211283296, + "epoch": 3.5039370078740157, + "grad_norm": 0.22334018349647522, + "learning_rate": 1e-06, + "loss": -0.0625, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0022797048441134393, + "clip_ratio/high_mean": 0.0009540160008327803, + "clip_ratio/low_mean": 0.0005550076903091394, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015090236738615204, + "completions/clipped_ratio": 0.1841517857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4064.0, + "completions/mean_length": 1354.9676513671875, + "completions/mean_terminated_length": 736.2667846679688, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 3.506270049577136, + "grad_norm": 0.3129500448703766, + "learning_rate": 1e-06, + "loss": -0.0965, + "num_tokens": 220151575.0, + "reward": 0.5792410969734192, + "reward_std": 0.17836888134479523, + "rewards/verify_math_reward/mean": 0.5792410969734192, + "rewards/verify_math_reward/std": 0.49395665526390076, + "step": 1501 + }, + { + "clip_ratio/high_max": 0.0026659188079065643, + "clip_ratio/high_mean": 0.0010767554540507263, + "clip_ratio/low_mean": 0.0008429652934864862, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019197206638636999, + "epoch": 3.5086030912802566, + "grad_norm": 0.2578331232070923, + "learning_rate": 1e-06, + "loss": -0.0969, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.0027675622841343284, + "clip_ratio/high_mean": 0.0011148411813337589, + "clip_ratio/low_mean": 0.0009087679827644024, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020236091877450235, + "epoch": 3.5109361329833773, + "grad_norm": 0.24845904111862183, + "learning_rate": 1e-06, + "loss": -0.097, + "step": 1503 + }, + { + "clip_ratio/high_max": 0.002540670393500477, + "clip_ratio/high_mean": 0.001044667356836726, + "clip_ratio/low_mean": 0.0011574263771763071, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002202093724918086, + "epoch": 3.5132691746864975, + "grad_norm": 0.292165607213974, + "learning_rate": 1e-06, + "loss": -0.0971, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0026543206186033785, + "clip_ratio/high_mean": 0.000993739169643959, + "clip_ratio/low_mean": 0.0007413307284878101, + "clip_ratio/low_min": 4.1377028537681326e-05, + "clip_ratio/region_mean": 0.001735069883579854, + "completions/clipped_ratio": 0.1964285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4063.0, + "completions/mean_length": 1401.6429443359375, + "completions/mean_terminated_length": 743.022216796875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 3.515602216389618, + "grad_norm": 0.35746651887893677, + "learning_rate": 1e-06, + "loss": -0.0986, + "num_tokens": 220782847.0, + "reward": 0.5323660969734192, + "reward_std": 0.15777993202209473, + "rewards/verify_math_reward/mean": 0.5323660969734192, + "rewards/verify_math_reward/std": 0.4992299973964691, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0028942258359165862, + "clip_ratio/high_mean": 0.0011398786791687598, + "clip_ratio/low_mean": 0.0008631971413706196, + "clip_ratio/low_min": 1.7467858924646862e-05, + "clip_ratio/region_mean": 0.0020030757877975702, + "epoch": 3.5179352580927383, + "grad_norm": 0.3008044958114624, + "learning_rate": 1e-06, + "loss": -0.0988, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.003281142056948738, + "clip_ratio/high_mean": 0.0011600244151850347, + "clip_ratio/low_mean": 0.0010390899451522273, + "clip_ratio/low_min": 1.840942604758311e-05, + "clip_ratio/region_mean": 0.002199114329414442, + "epoch": 3.520268299795859, + "grad_norm": 0.2596571147441864, + "learning_rate": 1e-06, + "loss": -0.099, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.002689667609956814, + "clip_ratio/high_mean": 0.0010504342744752648, + "clip_ratio/low_mean": 0.0013327549550012918, + "clip_ratio/low_min": 3.4935717849293724e-05, + "clip_ratio/region_mean": 0.0023831892031012103, + "epoch": 3.522601341498979, + "grad_norm": 0.21419784426689148, + "learning_rate": 1e-06, + "loss": -0.0991, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.002025470144872088, + "clip_ratio/high_mean": 0.000934464811507496, + "clip_ratio/low_mean": 0.0007760484650134458, + "clip_ratio/low_min": 1.0796337846841197e-05, + "clip_ratio/region_mean": 0.0017105132646975107, + "completions/clipped_ratio": 0.1640625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4023.0, + "completions/mean_length": 1251.680908203125, + "completions/mean_terminated_length": 693.449951171875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "epoch": 3.5249343832021, + "grad_norm": 0.3148497939109802, + "learning_rate": 1e-06, + "loss": -0.0672, + "num_tokens": 221397097.0, + "reward": 0.566964328289032, + "reward_std": 0.1840411275625229, + "rewards/verify_math_reward/mean": 0.5669642686843872, + "rewards/verify_math_reward/std": 0.49577224254608154, + "step": 1509 + }, + { + "clip_ratio/high_max": 0.002676955111382995, + "clip_ratio/high_mean": 0.0011807695045717992, + "clip_ratio/low_mean": 0.0010588440036372049, + "clip_ratio/low_min": 1.955977495526895e-05, + "clip_ratio/region_mean": 0.002239613422716502, + "epoch": 3.52726742490522, + "grad_norm": 0.3090699017047882, + "learning_rate": 1e-06, + "loss": -0.0675, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0025940429040929303, + "clip_ratio/high_mean": 0.001103127473470522, + "clip_ratio/low_mean": 0.0012223327503306791, + "clip_ratio/low_min": 1.955977495526895e-05, + "clip_ratio/region_mean": 0.0023254601765074767, + "epoch": 3.5296004666083407, + "grad_norm": 0.2467300146818161, + "learning_rate": 1e-06, + "loss": -0.0676, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.002437049952277448, + "clip_ratio/high_mean": 0.0010338622214476345, + "clip_ratio/low_mean": 0.0014347818942042068, + "clip_ratio/low_min": 1.3039849363849498e-05, + "clip_ratio/region_mean": 0.0024686441174708307, + "epoch": 3.531933508311461, + "grad_norm": 0.2955998182296753, + "learning_rate": 1e-06, + "loss": -0.0677, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0023475160778616555, + "clip_ratio/high_mean": 0.000987039549727342, + "clip_ratio/low_mean": 0.000704028707332327, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016910682679736055, + "completions/clipped_ratio": 0.1707589285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3899.0, + "completions/mean_length": 1262.3326416015625, + "completions/mean_terminated_length": 678.8182983398438, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 3.5342665500145816, + "grad_norm": 0.35950562357902527, + "learning_rate": 1e-06, + "loss": -0.1186, + "num_tokens": 221986099.0, + "reward": 0.598214328289032, + "reward_std": 0.16904954612255096, + "rewards/verify_math_reward/mean": 0.5982142686843872, + "rewards/verify_math_reward/std": 0.49053287506103516, + "step": 1513 + }, + { + "clip_ratio/high_max": 0.0035973012636532076, + "clip_ratio/high_mean": 0.001298691571719246, + "clip_ratio/low_mean": 0.0008401642062381143, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021388557652244344, + "epoch": 3.536599591717702, + "grad_norm": 0.27253785729408264, + "learning_rate": 1e-06, + "loss": -0.1188, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.0032188785189646296, + "clip_ratio/high_mean": 0.0012338145006651757, + "clip_ratio/low_mean": 0.0010743178299890133, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002308132352482062, + "epoch": 3.5389326334208224, + "grad_norm": 0.22632579505443573, + "learning_rate": 1e-06, + "loss": -0.119, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.00277203215227928, + "clip_ratio/high_mean": 0.0011150104546686634, + "clip_ratio/low_mean": 0.0012494994443841279, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023645098917768337, + "epoch": 3.5412656751239426, + "grad_norm": 0.233441561460495, + "learning_rate": 1e-06, + "loss": -0.119, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0033738151541911066, + "clip_ratio/high_mean": 0.0010480080181878293, + "clip_ratio/low_mean": 0.000549867203062604, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015978752744558733, + "completions/clipped_ratio": 0.1964285714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3058.0, + "completions/mean_length": 1312.8984375, + "completions/mean_terminated_length": 632.584716796875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 3.5435987168270633, + "grad_norm": 0.3846149146556854, + "learning_rate": 1e-06, + "loss": -0.0588, + "num_tokens": 222541640.0, + "reward": 0.53125, + "reward_std": 0.13775329291820526, + "rewards/verify_math_reward/mean": 0.53125, + "rewards/verify_math_reward/std": 0.4993011951446533, + "step": 1517 + }, + { + "clip_ratio/high_max": 0.003223258572688792, + "clip_ratio/high_mean": 0.0011083738590969006, + "clip_ratio/low_mean": 0.0009958546870620921, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021042284861323424, + "epoch": 3.545931758530184, + "grad_norm": 0.3291001319885254, + "learning_rate": 1e-06, + "loss": -0.059, + "step": 1518 + }, + { + "clip_ratio/high_max": 0.0034833777172025293, + "clip_ratio/high_mean": 0.0011976627065450884, + "clip_ratio/low_mean": 0.001046523768309271, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022441865148721263, + "epoch": 3.548264800233304, + "grad_norm": 0.4383046329021454, + "learning_rate": 1e-06, + "loss": -0.0591, + "step": 1519 + }, + { + "clip_ratio/high_max": 0.0036275723250582814, + "clip_ratio/high_mean": 0.0011557021680346224, + "clip_ratio/low_mean": 0.0012577502566273324, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002413452424661955, + "epoch": 3.5505978419364244, + "grad_norm": 0.2779461145401001, + "learning_rate": 1e-06, + "loss": -0.0592, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0026107412777491845, + "clip_ratio/high_mean": 0.0011020833007933106, + "clip_ratio/low_mean": 0.0007288959131983574, + "clip_ratio/low_min": 1.5644554878235795e-05, + "clip_ratio/region_mean": 0.00183097917761188, + "completions/clipped_ratio": 0.1819196428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3292.0, + "completions/mean_length": 1295.583740234375, + "completions/mean_terminated_length": 672.844482421875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 3.552930883639545, + "grad_norm": 0.35715851187705994, + "learning_rate": 1e-06, + "loss": -0.1101, + "num_tokens": 223118067.0, + "reward": 0.5714285969734192, + "reward_std": 0.20850077271461487, + "rewards/verify_math_reward/mean": 0.5714285969734192, + "rewards/verify_math_reward/std": 0.49514803290367126, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.0031851034073042683, + "clip_ratio/high_mean": 0.0013549956274800934, + "clip_ratio/low_mean": 0.0009374552846566075, + "clip_ratio/low_min": 3.128910975647159e-05, + "clip_ratio/region_mean": 0.0022924509103177115, + "epoch": 3.5552639253426657, + "grad_norm": 0.30691832304000854, + "learning_rate": 1e-06, + "loss": -0.1104, + "step": 1522 + }, + { + "clip_ratio/high_max": 0.003452163393376395, + "clip_ratio/high_mean": 0.0013758698623860255, + "clip_ratio/low_mean": 0.0010093992796100792, + "clip_ratio/low_min": 1.7370761270285584e-05, + "clip_ratio/region_mean": 0.002385269144724589, + "epoch": 3.557596967045786, + "grad_norm": 0.2864389717578888, + "learning_rate": 1e-06, + "loss": -0.1106, + "step": 1523 + }, + { + "clip_ratio/high_max": 0.003090730773692485, + "clip_ratio/high_mean": 0.0013269843148009386, + "clip_ratio/low_mean": 0.0013503560730896425, + "clip_ratio/low_min": 3.098661545664072e-05, + "clip_ratio/region_mean": 0.002677340409718454, + "epoch": 3.5599300087489065, + "grad_norm": 0.25327908992767334, + "learning_rate": 1e-06, + "loss": -0.1107, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0025775858885026537, + "clip_ratio/high_mean": 0.000957845215452835, + "clip_ratio/low_mean": 0.0007984565493188711, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017563017754582688, + "completions/clipped_ratio": 0.1685267857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3018.0, + "completions/mean_length": 1227.01904296875, + "completions/mean_terminated_length": 645.5208129882812, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 3.5622630504520267, + "grad_norm": 0.4949190318584442, + "learning_rate": 1e-06, + "loss": -0.0815, + "num_tokens": 223689812.0, + "reward": 0.5959821939468384, + "reward_std": 0.1830640286207199, + "rewards/verify_math_reward/mean": 0.5959821343421936, + "rewards/verify_math_reward/std": 0.490975022315979, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0029955473728477955, + "clip_ratio/high_mean": 0.0011499087486299686, + "clip_ratio/low_mean": 0.0010720030668380787, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022219118181965314, + "epoch": 3.5645960921551474, + "grad_norm": 0.28687843680381775, + "learning_rate": 1e-06, + "loss": -0.0817, + "step": 1526 + }, + { + "clip_ratio/high_max": 0.0034787955810315907, + "clip_ratio/high_mean": 0.0011736647647921927, + "clip_ratio/low_mean": 0.0011168585679115495, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002290523327246774, + "epoch": 3.5669291338582676, + "grad_norm": 0.25637078285217285, + "learning_rate": 1e-06, + "loss": -0.0819, + "step": 1527 + }, + { + "clip_ratio/high_max": 0.00323119483073242, + "clip_ratio/high_mean": 0.0011336969037074596, + "clip_ratio/low_mean": 0.0014666480810774374, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002600344938400667, + "epoch": 3.5692621755613883, + "grad_norm": 0.32233622670173645, + "learning_rate": 1e-06, + "loss": -0.0821, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.002661401405930519, + "clip_ratio/high_mean": 0.0010299417026544688, + "clip_ratio/low_mean": 0.0004914766668662196, + "clip_ratio/low_min": 1.4328289580589626e-05, + "clip_ratio/region_mean": 0.0015214183513307944, + "completions/clipped_ratio": 0.1819196428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3200.0, + "completions/mean_length": 1296.469970703125, + "completions/mean_terminated_length": 673.9276733398438, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 3.5715952172645085, + "grad_norm": 0.3157018721103668, + "learning_rate": 1e-06, + "loss": -0.0676, + "num_tokens": 224279809.0, + "reward": 0.5491071939468384, + "reward_std": 0.16270402073860168, + "rewards/verify_math_reward/mean": 0.5491071343421936, + "rewards/verify_math_reward/std": 0.49786055088043213, + "step": 1529 + }, + { + "clip_ratio/high_max": 0.0030244411318562925, + "clip_ratio/high_mean": 0.0011270798877376365, + "clip_ratio/low_mean": 0.0007608474184053193, + "clip_ratio/low_min": 1.735147088766098e-05, + "clip_ratio/region_mean": 0.0018879273447964806, + "epoch": 3.573928258967629, + "grad_norm": 0.28625938296318054, + "learning_rate": 1e-06, + "loss": -0.0679, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0030160953974700533, + "clip_ratio/high_mean": 0.0011641476585282362, + "clip_ratio/low_mean": 0.0009189475094899535, + "clip_ratio/low_min": 3.470294177532196e-05, + "clip_ratio/region_mean": 0.0020830951689276844, + "epoch": 3.5762613006707493, + "grad_norm": 0.23650334775447845, + "learning_rate": 1e-06, + "loss": -0.068, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.0031486430452787317, + "clip_ratio/high_mean": 0.0011302901166345691, + "clip_ratio/low_mean": 0.0010223674125882098, + "clip_ratio/low_min": 8.596973930252716e-05, + "clip_ratio/region_mean": 0.002152657529222779, + "epoch": 3.57859434237387, + "grad_norm": 0.21814867854118347, + "learning_rate": 1e-06, + "loss": -0.0681, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0026995788648491725, + "clip_ratio/high_mean": 0.0010108518345077755, + "clip_ratio/low_mean": 0.0006342021315504098, + "clip_ratio/low_min": 2.504006442904938e-05, + "clip_ratio/region_mean": 0.0016450539478682913, + "completions/clipped_ratio": 0.1830357142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2487.0, + "completions/mean_length": 1274.7288818359375, + "completions/mean_terminated_length": 642.6406860351562, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 3.5809273840769906, + "grad_norm": 0.42358139157295227, + "learning_rate": 1e-06, + "loss": -0.0879, + "num_tokens": 224843742.0, + "reward": 0.5881696939468384, + "reward_std": 0.16555650532245636, + "rewards/verify_math_reward/mean": 0.5881696343421936, + "rewards/verify_math_reward/std": 0.4924395978450775, + "step": 1533 + }, + { + "clip_ratio/high_max": 0.0031136035977397114, + "clip_ratio/high_mean": 0.0012501615965447854, + "clip_ratio/low_mean": 0.0009599393051757943, + "clip_ratio/low_min": 1.252003221452469e-05, + "clip_ratio/region_mean": 0.0022101009089965373, + "epoch": 3.583260425780111, + "grad_norm": 0.31278711557388306, + "learning_rate": 1e-06, + "loss": -0.0883, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.0035869555504177697, + "clip_ratio/high_mean": 0.001279306146898307, + "clip_ratio/low_mean": 0.0011774325430451427, + "clip_ratio/low_min": 5.008012885809876e-05, + "clip_ratio/region_mean": 0.0024567386863054708, + "epoch": 3.585593467483231, + "grad_norm": 0.26393356919288635, + "learning_rate": 1e-06, + "loss": -0.0886, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.00319365114410175, + "clip_ratio/high_mean": 0.0011895089846802875, + "clip_ratio/low_mean": 0.0013025746593484655, + "clip_ratio/low_min": 3.052503234357573e-05, + "clip_ratio/region_mean": 0.0024920836149249226, + "epoch": 3.5879265091863517, + "grad_norm": 0.27731215953826904, + "learning_rate": 1e-06, + "loss": -0.0885, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.003128183823719155, + "clip_ratio/high_mean": 0.0011135051026940346, + "clip_ratio/low_mean": 0.0005207825106481323, + "clip_ratio/low_min": 1.7472742911195382e-05, + "clip_ratio/region_mean": 0.0016342875969712622, + "completions/clipped_ratio": 0.1819196428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3815.0, + "completions/mean_length": 1286.3895263671875, + "completions/mean_terminated_length": 661.605712890625, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 3.5902595508894724, + "grad_norm": 0.3832108676433563, + "learning_rate": 1e-06, + "loss": -0.0734, + "num_tokens": 225421971.0, + "reward": 0.5301339626312256, + "reward_std": 0.17659901082515717, + "rewards/verify_math_reward/mean": 0.5301339030265808, + "rewards/verify_math_reward/std": 0.49936985969543457, + "step": 1537 + }, + { + "clip_ratio/high_max": 0.003468630740826484, + "clip_ratio/high_mean": 0.0012430049791873898, + "clip_ratio/low_mean": 0.0006966114424358238, + "clip_ratio/low_min": 2.518130531825591e-05, + "clip_ratio/region_mean": 0.0019396164425415918, + "epoch": 3.5925925925925926, + "grad_norm": 0.28782737255096436, + "learning_rate": 1e-06, + "loss": -0.0736, + "step": 1538 + }, + { + "clip_ratio/high_max": 0.00373454752843827, + "clip_ratio/high_mean": 0.0013299306010594591, + "clip_ratio/low_mean": 0.000834752927403315, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0021646835084538907, + "epoch": 3.5949256342957128, + "grad_norm": 0.2644355893135071, + "learning_rate": 1e-06, + "loss": -0.0738, + "step": 1539 + }, + { + "clip_ratio/high_max": 0.0035358406603336334, + "clip_ratio/high_mean": 0.0012311937171034515, + "clip_ratio/low_mean": 0.0010186804520344594, + "clip_ratio/low_min": 4.148689185967669e-05, + "clip_ratio/region_mean": 0.002249874160042964, + "epoch": 3.5972586759988334, + "grad_norm": 0.29489627480506897, + "learning_rate": 1e-06, + "loss": -0.0739, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.003314832487376407, + "clip_ratio/high_mean": 0.0010940357988147298, + "clip_ratio/low_mean": 0.0006898802121213521, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017839160209405236, + "completions/clipped_ratio": 0.2299107142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2849.0, + "completions/mean_length": 1440.69091796875, + "completions/mean_terminated_length": 647.9464111328125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 3.599591717701954, + "grad_norm": 0.39426520466804504, + "learning_rate": 1e-06, + "loss": -0.1001, + "num_tokens": 225952342.0, + "reward": 0.5412946939468384, + "reward_std": 0.1713072657585144, + "rewards/verify_math_reward/mean": 0.5412946343421936, + "rewards/verify_math_reward/std": 0.49857014417648315, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.0036451994528761134, + "clip_ratio/high_mean": 0.0013332387898117304, + "clip_ratio/low_mean": 0.0009492857570876367, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002282524590555113, + "epoch": 3.6019247594050743, + "grad_norm": 0.46961215138435364, + "learning_rate": 1e-06, + "loss": -0.1004, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.003487508642137982, + "clip_ratio/high_mean": 0.0013010401034989627, + "clip_ratio/low_mean": 0.0011625834558799397, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002463623481162358, + "epoch": 3.604257801108195, + "grad_norm": 0.26113542914390564, + "learning_rate": 1e-06, + "loss": -0.1006, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.003831864560197573, + "clip_ratio/high_mean": 0.0012798725147149526, + "clip_ratio/low_mean": 0.0014721531297254842, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027520255971467122, + "epoch": 3.606590842811315, + "grad_norm": 0.2875174283981323, + "learning_rate": 1e-06, + "loss": -0.1007, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.003023206598300021, + "clip_ratio/high_mean": 0.001271988690859871, + "clip_ratio/low_mean": 0.0006859922905277926, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019579810250434093, + "completions/clipped_ratio": 0.2198660714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2363.0, + "completions/mean_length": 1441.9923095703125, + "completions/mean_terminated_length": 694.010009765625, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 3.608923884514436, + "grad_norm": 0.4443901777267456, + "learning_rate": 1e-06, + "loss": -0.1147, + "num_tokens": 226527839.0, + "reward": 0.5691964626312256, + "reward_std": 0.20523346960544586, + "rewards/verify_math_reward/mean": 0.5691964030265808, + "rewards/verify_math_reward/std": 0.4954652488231659, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0034600492435856722, + "clip_ratio/high_mean": 0.0014443448417296167, + "clip_ratio/low_mean": 0.0009484658476139884, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023928107184474356, + "epoch": 3.611256926217556, + "grad_norm": 0.27703621983528137, + "learning_rate": 1e-06, + "loss": -0.115, + "step": 1546 + }, + { + "clip_ratio/high_max": 0.003560012446541805, + "clip_ratio/high_mean": 0.0013946929029771127, + "clip_ratio/low_mean": 0.0011226687602174934, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0025173616813845, + "epoch": 3.6135899679206767, + "grad_norm": 0.32288092374801636, + "learning_rate": 1e-06, + "loss": -0.1152, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.003413889651710633, + "clip_ratio/high_mean": 0.0014856893867545296, + "clip_ratio/low_mean": 0.0013747827833867632, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002860472144675441, + "epoch": 3.615923009623797, + "grad_norm": 0.30773279070854187, + "learning_rate": 1e-06, + "loss": -0.1154, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0021399538090918213, + "clip_ratio/high_mean": 0.0008620997614343651, + "clip_ratio/low_mean": 0.0005568014967138879, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014189012617862318, + "completions/clipped_ratio": 0.2477678571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2899.0, + "completions/mean_length": 1520.0301513671875, + "completions/mean_terminated_length": 671.5653076171875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 3.6182560513269175, + "grad_norm": 0.38228416442871094, + "learning_rate": 1e-06, + "loss": -0.0771, + "num_tokens": 227069810.0, + "reward": 0.5189732313156128, + "reward_std": 0.17058978974819183, + "rewards/verify_math_reward/mean": 0.5189732313156128, + "rewards/verify_math_reward/std": 0.49991893768310547, + "step": 1549 + }, + { + "clip_ratio/high_max": 0.0030548020695277955, + "clip_ratio/high_mean": 0.001068041374310269, + "clip_ratio/low_mean": 0.0009025194267451297, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001970560821064282, + "epoch": 3.6205890930300377, + "grad_norm": 0.32157832384109497, + "learning_rate": 1e-06, + "loss": -0.0774, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.002751321553660091, + "clip_ratio/high_mean": 0.0010448119501234032, + "clip_ratio/low_mean": 0.001041971154336352, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020867831117357127, + "epoch": 3.6229221347331584, + "grad_norm": 0.24815063178539276, + "learning_rate": 1e-06, + "loss": -0.0776, + "step": 1551 + }, + { + "clip_ratio/high_max": 0.0026036685085273348, + "clip_ratio/high_mean": 0.0009824264798226068, + "clip_ratio/low_mean": 0.0012956580212630797, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022780845465604216, + "epoch": 3.625255176436279, + "grad_norm": 0.3354860544204712, + "learning_rate": 1e-06, + "loss": -0.0776, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0030948232670198195, + "clip_ratio/high_mean": 0.001258082458662102, + "clip_ratio/low_mean": 0.0007376344910881016, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019957169497502036, + "completions/clipped_ratio": 0.1662946428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2397.0, + "completions/mean_length": 1174.458740234375, + "completions/mean_terminated_length": 591.71484375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 3.6275882181393992, + "grad_norm": 0.37742355465888977, + "learning_rate": 1e-06, + "loss": -0.0925, + "num_tokens": 227601141.0, + "reward": 0.606026828289032, + "reward_std": 0.15725818276405334, + "rewards/verify_math_reward/mean": 0.6060267686843872, + "rewards/verify_math_reward/std": 0.48890194296836853, + "step": 1553 + }, + { + "clip_ratio/high_max": 0.003571727851522155, + "clip_ratio/high_mean": 0.0013547524649766274, + "clip_ratio/low_mean": 0.0009574865052854875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002312238997546956, + "epoch": 3.6299212598425195, + "grad_norm": 0.364230751991272, + "learning_rate": 1e-06, + "loss": -0.0927, + "step": 1554 + }, + { + "clip_ratio/high_max": 0.003034077410120517, + "clip_ratio/high_mean": 0.0012617486208910123, + "clip_ratio/low_mean": 0.0011363514995537116, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0023981001213542186, + "epoch": 3.63225430154564, + "grad_norm": 0.3032190203666687, + "learning_rate": 1e-06, + "loss": -0.093, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0028370428626658395, + "clip_ratio/high_mean": 0.001281575325265294, + "clip_ratio/low_mean": 0.0013283705375215504, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002609945928270463, + "epoch": 3.6345873432487608, + "grad_norm": 0.27822694182395935, + "learning_rate": 1e-06, + "loss": -0.0931, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.002680474444787251, + "clip_ratio/high_mean": 0.0010216867940471275, + "clip_ratio/low_mean": 0.0005964219117231551, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016181086975848302, + "completions/clipped_ratio": 0.2142857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3919.0, + "completions/mean_length": 1436.2857666015625, + "completions/mean_terminated_length": 710.9091186523438, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 3.636920384951881, + "grad_norm": 0.3404932916164398, + "learning_rate": 1e-06, + "loss": -0.0837, + "num_tokens": 228181853.0, + "reward": 0.4988839626312256, + "reward_std": 0.18201345205307007, + "rewards/verify_math_reward/mean": 0.4988839328289032, + "rewards/verify_math_reward/std": 0.5002779960632324, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.0031010887505544815, + "clip_ratio/high_mean": 0.00122041084432567, + "clip_ratio/low_mean": 0.0008648375960547128, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002085248452203814, + "epoch": 3.6392534266550016, + "grad_norm": 0.31997114419937134, + "learning_rate": 1e-06, + "loss": -0.0839, + "step": 1558 + }, + { + "clip_ratio/high_max": 0.003221661929273978, + "clip_ratio/high_mean": 0.001245005041710101, + "clip_ratio/low_mean": 0.001028126946039265, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022731319768354297, + "epoch": 3.641586468358122, + "grad_norm": 0.23597189784049988, + "learning_rate": 1e-06, + "loss": -0.0841, + "step": 1559 + }, + { + "clip_ratio/high_max": 0.0029438552883220837, + "clip_ratio/high_mean": 0.0011887395639860188, + "clip_ratio/low_mean": 0.0012269534418010153, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002415692993963603, + "epoch": 3.6439195100612425, + "grad_norm": 0.2510083317756653, + "learning_rate": 1e-06, + "loss": -0.0842, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0028251345502212644, + "clip_ratio/high_mean": 0.0012001818868156988, + "clip_ratio/low_mean": 0.0006806998262618436, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018808816967066377, + "completions/clipped_ratio": 0.2142857142857143, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3896.0, + "completions/mean_length": 1424.5101318359375, + "completions/mean_terminated_length": 695.921875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 3.6462525517643627, + "grad_norm": 0.3832797408103943, + "learning_rate": 1e-06, + "loss": -0.1433, + "num_tokens": 228758326.0, + "reward": 0.574776828289032, + "reward_std": 0.2134273499250412, + "rewards/verify_math_reward/mean": 0.5747767686843872, + "rewards/verify_math_reward/std": 0.49465295672416687, + "step": 1561 + }, + { + "clip_ratio/high_max": 0.0038279997825156897, + "clip_ratio/high_mean": 0.001497450426541036, + "clip_ratio/low_mean": 0.0009243143595085712, + "clip_ratio/low_min": 1.1888910194102209e-05, + "clip_ratio/region_mean": 0.0024217647878685966, + "epoch": 3.6485855934674833, + "grad_norm": 0.33032625913619995, + "learning_rate": 1e-06, + "loss": -0.1436, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.0033771845774026588, + "clip_ratio/high_mean": 0.001481091407185886, + "clip_ratio/low_mean": 0.001131790140789235, + "clip_ratio/low_min": 7.760118933219928e-06, + "clip_ratio/region_mean": 0.0026128815370611846, + "epoch": 3.6509186351706036, + "grad_norm": 0.41828563809394836, + "learning_rate": 1e-06, + "loss": -0.1437, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.003442059605731629, + "clip_ratio/high_mean": 0.0013597105171356816, + "clip_ratio/low_mean": 0.0012348913332971279, + "clip_ratio/low_min": 2.3280357709154487e-05, + "clip_ratio/region_mean": 0.002594601915916428, + "epoch": 3.653251676873724, + "grad_norm": 0.2683136463165283, + "learning_rate": 1e-06, + "loss": -0.144, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.003076203509408515, + "clip_ratio/high_mean": 0.001119939750424237, + "clip_ratio/low_mean": 0.0006350224448397057, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001754962228005752, + "completions/clipped_ratio": 0.2433035714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3473.0, + "completions/mean_length": 1496.946533203125, + "completions/mean_terminated_length": 661.2625122070312, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 3.6555847185768444, + "grad_norm": 0.3296455144882202, + "learning_rate": 1e-06, + "loss": -0.069, + "num_tokens": 229305646.0, + "reward": 0.4843750298023224, + "reward_std": 0.14563976228237152, + "rewards/verify_math_reward/mean": 0.484375, + "rewards/verify_math_reward/std": 0.5000349283218384, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.00340685666014906, + "clip_ratio/high_mean": 0.0012808764458895894, + "clip_ratio/low_mean": 0.0007630479349245434, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020439243817236274, + "epoch": 3.657917760279965, + "grad_norm": 0.3247814178466797, + "learning_rate": 1e-06, + "loss": -0.0693, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.004076259312569164, + "clip_ratio/high_mean": 0.001424640493496554, + "clip_ratio/low_mean": 0.0010457216612849152, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002470362167514395, + "epoch": 3.6602508019830857, + "grad_norm": 0.27823442220687866, + "learning_rate": 1e-06, + "loss": -0.0695, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.003177139748004265, + "clip_ratio/high_mean": 0.00116790375977871, + "clip_ratio/low_mean": 0.0011514383477333467, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002319342085684184, + "epoch": 3.662583843686206, + "grad_norm": 0.2684524953365326, + "learning_rate": 1e-06, + "loss": -0.0695, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.002009153824474197, + "clip_ratio/high_mean": 0.0007501048912672559, + "clip_ratio/low_mean": 0.00029923092245098815, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010493357949599158, + "completions/clipped_ratio": 0.1495535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3410.0, + "completions/mean_length": 1177.0045166015625, + "completions/mean_terminated_length": 663.6903076171875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 3.664916885389326, + "grad_norm": 0.24503754079341888, + "learning_rate": 1e-06, + "loss": -0.0425, + "num_tokens": 229906562.0, + "reward": 0.6584821939468384, + "reward_std": 0.12223109602928162, + "rewards/verify_math_reward/mean": 0.6584821343421936, + "rewards/verify_math_reward/std": 0.4744836091995239, + "step": 1569 + }, + { + "clip_ratio/high_max": 0.00233134716108907, + "clip_ratio/high_mean": 0.00090423029178055, + "clip_ratio/low_mean": 0.0003734052588697523, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012776355688401964, + "epoch": 3.667249927092447, + "grad_norm": 0.3895801603794098, + "learning_rate": 1e-06, + "loss": -0.0425, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.002474146938766353, + "clip_ratio/high_mean": 0.0009299981356889475, + "clip_ratio/low_mean": 0.0004818480730364172, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001411846220435109, + "epoch": 3.6695829687955674, + "grad_norm": 0.21874651312828064, + "learning_rate": 1e-06, + "loss": -0.0427, + "step": 1571 + }, + { + "clip_ratio/high_max": 0.002170715044485405, + "clip_ratio/high_mean": 0.0007921155311123584, + "clip_ratio/low_mean": 0.0005451086276480055, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013372241410252172, + "epoch": 3.6719160104986877, + "grad_norm": 0.21428361535072327, + "learning_rate": 1e-06, + "loss": -0.0427, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0026036849012598395, + "clip_ratio/high_mean": 0.0010126106444658944, + "clip_ratio/low_mean": 0.0006757626333637745, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016883732459973544, + "completions/clipped_ratio": 0.1785714285714286, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3878.0, + "completions/mean_length": 1322.5179443359375, + "completions/mean_terminated_length": 719.5869750976562, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 3.674249052201808, + "grad_norm": 0.3311329185962677, + "learning_rate": 1e-06, + "loss": -0.0896, + "num_tokens": 230528706.0, + "reward": 0.5580357313156128, + "reward_std": 0.1582677811384201, + "rewards/verify_math_reward/mean": 0.5580357313156128, + "rewards/verify_math_reward/std": 0.49689781665802, + "step": 1573 + }, + { + "clip_ratio/high_max": 0.003360018708917778, + "clip_ratio/high_mean": 0.0012589432990353089, + "clip_ratio/low_mean": 0.0008136436954373494, + "clip_ratio/low_min": 1.4357913642015774e-05, + "clip_ratio/region_mean": 0.0020725869471789338, + "epoch": 3.6765820939049285, + "grad_norm": 0.2822306156158447, + "learning_rate": 1e-06, + "loss": -0.0898, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.0033367287542205304, + "clip_ratio/high_mean": 0.0012043147580698133, + "clip_ratio/low_mean": 0.0009804068076846306, + "clip_ratio/low_min": 2.8715827284031548e-05, + "clip_ratio/region_mean": 0.002184721546655055, + "epoch": 3.678915135608049, + "grad_norm": 0.24652273952960968, + "learning_rate": 1e-06, + "loss": -0.09, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0031179989455267787, + "clip_ratio/high_mean": 0.0011686939578794409, + "clip_ratio/low_mean": 0.001148701980127953, + "clip_ratio/low_min": 1.4357913642015774e-05, + "clip_ratio/region_mean": 0.002317395934369415, + "epoch": 3.6812481773111694, + "grad_norm": 0.22814157605171204, + "learning_rate": 1e-06, + "loss": -0.0901, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0022585531005461235, + "clip_ratio/high_mean": 0.0008721222602616763, + "clip_ratio/low_mean": 0.0006358251857818686, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015079474505910184, + "completions/clipped_ratio": 0.1975446428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4028.0, + "completions/mean_length": 1368.5313720703125, + "completions/mean_terminated_length": 697.095947265625, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 3.68358121901429, + "grad_norm": 0.32418620586395264, + "learning_rate": 1e-06, + "loss": -0.0899, + "num_tokens": 231114142.0, + "reward": 0.520089328289032, + "reward_std": 0.16070912778377533, + "rewards/verify_math_reward/mean": 0.5200892686843872, + "rewards/verify_math_reward/std": 0.4998753070831299, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.0029885207768529654, + "clip_ratio/high_mean": 0.0011393665590730961, + "clip_ratio/low_mean": 0.0008626439630461391, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002002010471187532, + "epoch": 3.6859142607174102, + "grad_norm": 0.2751808166503906, + "learning_rate": 1e-06, + "loss": -0.09, + "step": 1578 + }, + { + "clip_ratio/high_max": 0.003105989428149769, + "clip_ratio/high_mean": 0.0011380136656953255, + "clip_ratio/low_mean": 0.0009521870260869036, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0020902006945107132, + "epoch": 3.688247302420531, + "grad_norm": 0.24647995829582214, + "learning_rate": 1e-06, + "loss": -0.0902, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.0028495530496002175, + "clip_ratio/high_mean": 0.0010594399755063932, + "clip_ratio/low_mean": 0.0010839170718099922, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002143357072782237, + "epoch": 3.690580344123651, + "grad_norm": 0.23350109159946442, + "learning_rate": 1e-06, + "loss": -0.0903, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0023942124462337233, + "clip_ratio/high_mean": 0.0007833131221559597, + "clip_ratio/low_mean": 0.000566296829674684, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013496099709300324, + "completions/clipped_ratio": 0.1975446428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3932.0, + "completions/mean_length": 1337.771240234375, + "completions/mean_terminated_length": 658.7635498046875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 3.6929133858267718, + "grad_norm": 0.3115820586681366, + "learning_rate": 1e-06, + "loss": -0.0788, + "num_tokens": 231676737.0, + "reward": 0.5758928656578064, + "reward_std": 0.1375589668750763, + "rewards/verify_math_reward/mean": 0.5758928656578064, + "rewards/verify_math_reward/std": 0.49448272585868835, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.0033269945488427766, + "clip_ratio/high_mean": 0.001019602606902481, + "clip_ratio/low_mean": 0.0007508156904805219, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00177041827555513, + "epoch": 3.695246427529892, + "grad_norm": 0.2711862623691559, + "learning_rate": 1e-06, + "loss": -0.079, + "step": 1582 + }, + { + "clip_ratio/high_max": 0.003074318909057183, + "clip_ratio/high_mean": 0.0009527142501610797, + "clip_ratio/low_mean": 0.0008890186472854111, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001841732933826279, + "epoch": 3.6975794692330126, + "grad_norm": 0.3059765100479126, + "learning_rate": 1e-06, + "loss": -0.0791, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.0028426043718354777, + "clip_ratio/high_mean": 0.0008659506893309299, + "clip_ratio/low_mean": 0.0010804437115439214, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019463943535811268, + "epoch": 3.699912510936133, + "grad_norm": 0.24489635229110718, + "learning_rate": 1e-06, + "loss": -0.0793, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.002322709835425485, + "clip_ratio/high_mean": 0.0008757356263231486, + "clip_ratio/low_mean": 0.0006089560674809036, + "clip_ratio/low_min": 1.4637002095696516e-05, + "clip_ratio/region_mean": 0.001484691645600833, + "completions/clipped_ratio": 0.1696428571428571, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2399.0, + "completions/mean_length": 1211.009033203125, + "completions/mean_terminated_length": 621.6021728515625, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "epoch": 3.7022455526392535, + "grad_norm": 0.3546612858772278, + "learning_rate": 1e-06, + "loss": -0.0814, + "num_tokens": 232225569.0, + "reward": 0.5703125, + "reward_std": 0.17615394294261932, + "rewards/verify_math_reward/mean": 0.5703125, + "rewards/verify_math_reward/std": 0.49530795216560364, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0026163434667978436, + "clip_ratio/high_mean": 0.0010972397249133792, + "clip_ratio/low_mean": 0.0008886452833394287, + "clip_ratio/low_min": 4.30421805504011e-05, + "clip_ratio/region_mean": 0.0019858849700540304, + "epoch": 3.704578594342374, + "grad_norm": 0.3230176866054535, + "learning_rate": 1e-06, + "loss": -0.0816, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.0028723169307340868, + "clip_ratio/high_mean": 0.0011486682378745172, + "clip_ratio/low_mean": 0.00103460737682326, + "clip_ratio/low_min": 7.179314161476213e-05, + "clip_ratio/region_mean": 0.0021832755810464732, + "epoch": 3.7069116360454943, + "grad_norm": 0.2995162308216095, + "learning_rate": 1e-06, + "loss": -0.0817, + "step": 1587 + }, + { + "clip_ratio/high_max": 0.0025985515967477113, + "clip_ratio/high_mean": 0.0009519480572635075, + "clip_ratio/low_mean": 0.0012239921852597035, + "clip_ratio/low_min": 2.805626172630582e-05, + "clip_ratio/region_mean": 0.0021759402225143276, + "epoch": 3.7092446777486145, + "grad_norm": 0.2810906171798706, + "learning_rate": 1e-06, + "loss": -0.0818, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.002692543566809036, + "clip_ratio/high_mean": 0.0009204212274198653, + "clip_ratio/low_mean": 0.0006138828317716616, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001534304075903492, + "completions/clipped_ratio": 0.2075892857142857, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3768.0, + "completions/mean_length": 1386.204345703125, + "completions/mean_terminated_length": 676.3140869140625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 3.711577719451735, + "grad_norm": 0.32845979928970337, + "learning_rate": 1e-06, + "loss": -0.0701, + "num_tokens": 232796368.0, + "reward": 0.5301339626312256, + "reward_std": 0.14135508239269257, + "rewards/verify_math_reward/mean": 0.5301339030265808, + "rewards/verify_math_reward/std": 0.49936985969543457, + "step": 1589 + }, + { + "clip_ratio/high_max": 0.0029532299013226293, + "clip_ratio/high_mean": 0.0010062977435154608, + "clip_ratio/low_mean": 0.0007842654104024405, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017905630884342827, + "epoch": 3.713910761154856, + "grad_norm": 0.29079484939575195, + "learning_rate": 1e-06, + "loss": -0.0703, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0029964288987684995, + "clip_ratio/high_mean": 0.0010623050002322998, + "clip_ratio/low_mean": 0.0009586893656887696, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002020994397753384, + "epoch": 3.716243802857976, + "grad_norm": 0.2608084976673126, + "learning_rate": 1e-06, + "loss": -0.0704, + "step": 1591 + }, + { + "clip_ratio/high_max": 0.0024345178389921784, + "clip_ratio/high_mean": 0.0009444899878872093, + "clip_ratio/low_mean": 0.0011719619114956004, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002116451876645442, + "epoch": 3.7185768445610963, + "grad_norm": 0.281854510307312, + "learning_rate": 1e-06, + "loss": -0.0705, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.002234755262179533, + "clip_ratio/high_mean": 0.0007076127913023811, + "clip_ratio/low_mean": 0.0004429770510796516, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001150589836470317, + "completions/clipped_ratio": 0.1495535714285714, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3125.0, + "completions/mean_length": 1113.5804443359375, + "completions/mean_terminated_length": 589.1128540039062, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 3.720909886264217, + "grad_norm": 0.2862772047519684, + "learning_rate": 1e-06, + "loss": -0.0584, + "num_tokens": 233325464.0, + "reward": 0.645089328289032, + "reward_std": 0.12062598019838333, + "rewards/verify_math_reward/mean": 0.6450892686843872, + "rewards/verify_math_reward/std": 0.4787535071372986, + "step": 1593 + }, + { + "clip_ratio/high_max": 0.0031345177776529454, + "clip_ratio/high_mean": 0.0010075391801365186, + "clip_ratio/low_mean": 0.0006057004723061254, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016132396885950584, + "epoch": 3.7232429279673376, + "grad_norm": 0.25686419010162354, + "learning_rate": 1e-06, + "loss": -0.0586, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.0028180151057313196, + "clip_ratio/high_mean": 0.0009742606889631134, + "clip_ratio/low_mean": 0.0008117809757095529, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017860416701296344, + "epoch": 3.725575969670458, + "grad_norm": 0.1940649300813675, + "learning_rate": 1e-06, + "loss": -0.0587, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.002606392386951484, + "clip_ratio/high_mean": 0.0008690365175425541, + "clip_ratio/low_mean": 0.0008804468684502353, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017494833955424838, + "epoch": 3.7279090113735784, + "grad_norm": 0.26634952425956726, + "learning_rate": 1e-06, + "loss": -0.0588, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0021255537612887565, + "clip_ratio/high_mean": 0.0009493224206380546, + "clip_ratio/low_mean": 0.0006909248835427206, + "clip_ratio/low_min": 1.2359106221992988e-05, + "clip_ratio/region_mean": 0.0016402473047492094, + "completions/clipped_ratio": 0.1662946428571429, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4079.0, + "completions/mean_length": 1277.224365234375, + "completions/mean_terminated_length": 714.9785766601562, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 3.7302420530766986, + "grad_norm": 0.399694561958313, + "learning_rate": 1e-06, + "loss": -0.0923, + "num_tokens": 233939537.0, + "reward": 0.5602678656578064, + "reward_std": 0.19261160492897034, + "rewards/verify_math_reward/mean": 0.5602678656578064, + "rewards/verify_math_reward/std": 0.4966317117214203, + "step": 1597 + }, + { + "clip_ratio/high_max": 0.0033188991001225077, + "clip_ratio/high_mean": 0.001244036258867709, + "clip_ratio/low_mean": 0.0009532424633107439, + "clip_ratio/low_min": 4.3214809920755215e-05, + "clip_ratio/region_mean": 0.002197278750827536, + "epoch": 3.7325750947798193, + "grad_norm": 0.3414709270000458, + "learning_rate": 1e-06, + "loss": -0.0926, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.0031182927050394937, + "clip_ratio/high_mean": 0.0011967702739639208, + "clip_ratio/low_mean": 0.0010417346588837972, + "clip_ratio/low_min": 5.5573917052242905e-05, + "clip_ratio/region_mean": 0.0022385049087461084, + "epoch": 3.7349081364829395, + "grad_norm": 0.2714030146598816, + "learning_rate": 1e-06, + "loss": -0.0927, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.002907983485783916, + "clip_ratio/high_mean": 0.0012127297868573805, + "clip_ratio/low_mean": 0.001272044322831789, + "clip_ratio/low_min": 3.710917189891916e-05, + "clip_ratio/region_mean": 0.002484774093318265, + "epoch": 3.73724117818606, + "grad_norm": 0.3983018696308136, + "learning_rate": 1e-06, + "loss": -0.0928, + "step": 1600 + }, + { + "epoch": 3.73724117818606, + "step": 1600, + "total_flos": 0.0, + "train_loss": -0.05325791612935007, + "train_runtime": 64519.0326, + "train_samples_per_second": 22.22, + "train_steps_per_second": 0.025 + } + ], + "logging_steps": 1, + "max_steps": 1600, + "num_input_tokens_seen": 233939537, + "num_train_epochs": 4, + "save_steps": 160, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}