{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 14.951603498542275, "eval_steps": 500, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2996.0, "completions/mean_length": 577.6875, "completions/mean_terminated_length": 541.9887084960938, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.009329446064139942, "grad_norm": 0.1569121778011322, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 604112.0, "reward": 0.5457589626312256, "reward_std": 0.26444563269615173, "rewards/simpleverify_reward/mean": 0.5457589030265808, "rewards/simpleverify_reward/std": 0.4981798231601715, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3096.0, "completions/mean_length": 595.8671875, "completions/mean_terminated_length": 544.3363647460938, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.018658892128279883, "grad_norm": 0.13906316459178925, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 1225265.0, "reward": 0.4687500298023224, "reward_std": 0.2484123855829239, "rewards/simpleverify_reward/mean": 0.46875, "rewards/simpleverify_reward/std": 0.4993011951446533, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3684.0, "completions/mean_length": 584.3348388671875, "completions/mean_terminated_length": 540.68701171875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 0.027988338192419825, "grad_norm": 0.14353393018245697, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 1847373.0, "reward": 0.5133928656578064, "reward_std": 0.22838753461837769, "rewards/simpleverify_reward/mean": 0.5133928656578064, "rewards/simpleverify_reward/std": 0.500099778175354, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3737.0, "completions/mean_length": 572.2522583007812, "completions/mean_terminated_length": 524.4185791015625, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "epoch": 0.037317784256559766, "grad_norm": 0.14606700837612152, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 2439543.0, "reward": 0.5301339626312256, "reward_std": 0.26219862699508667, "rewards/simpleverify_reward/mean": 0.5301339030265808, "rewards/simpleverify_reward/std": 0.49936988949775696, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3241.0, "completions/mean_length": 574.8739013671875, "completions/mean_terminated_length": 535.132080078125, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "epoch": 0.04664723032069971, "grad_norm": 0.16904550790786743, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 3038198.0, "reward": 0.5089285969734192, "reward_std": 0.26486679911613464, "rewards/simpleverify_reward/mean": 0.5089285969734192, "rewards/simpleverify_reward/std": 0.5001994967460632, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3885.0, "completions/mean_length": 624.4710083007812, "completions/mean_terminated_length": 549.2611083984375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.05597667638483965, "grad_norm": 0.12923941016197205, "learning_rate": 1e-06, "loss": 0.0302, "num_tokens": 3688148.0, "reward": 0.5245535969734192, "reward_std": 0.2504050135612488, "rewards/simpleverify_reward/mean": 0.5245535969734192, "rewards/simpleverify_reward/std": 0.4996756911277771, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 580.8984375, "completions/mean_terminated_length": 525.1032104492188, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.0653061224489796, "grad_norm": 0.14436309039592743, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 4307089.0, "reward": 0.5234375, "reward_std": 0.2576860189437866, "rewards/simpleverify_reward/mean": 0.5234375, "rewards/simpleverify_reward/std": 0.49972933530807495, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 612.8114013671875, "completions/mean_terminated_length": 589.3292236328125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.07463556851311953, "grad_norm": 0.12321566045284271, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 4930560.0, "reward": 0.5133928656578064, "reward_std": 0.23840834200382233, "rewards/simpleverify_reward/mean": 0.5133928656578064, "rewards/simpleverify_reward/std": 0.500099778175354, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3697.0, "completions/mean_length": 659.3861694335938, "completions/mean_terminated_length": 564.8004150390625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.08396501457725948, "grad_norm": 0.12559792399406433, "learning_rate": 1e-06, "loss": 0.0335, "num_tokens": 5610346.0, "reward": 0.5133928656578064, "reward_std": 0.24325896799564362, "rewards/simpleverify_reward/mean": 0.5133928656578064, "rewards/simpleverify_reward/std": 0.500099778175354, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2372.0, "completions/mean_length": 627.857177734375, "completions/mean_terminated_length": 560.78271484375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.09329446064139942, "grad_norm": 0.12554146349430084, "learning_rate": 1e-06, "loss": 0.0343, "num_tokens": 6268146.0, "reward": 0.515625, "reward_std": 0.22474044561386108, "rewards/simpleverify_reward/mean": 0.515625, "rewards/simpleverify_reward/std": 0.5000349283218384, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 640.0792846679688, "completions/mean_terminated_length": 577.2443237304688, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.10262390670553936, "grad_norm": 0.1103038340806961, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 6926001.0, "reward": 0.515625, "reward_std": 0.19099508225917816, "rewards/simpleverify_reward/mean": 0.515625, "rewards/simpleverify_reward/std": 0.5000349283218384, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 2645.0, "completions/mean_length": 589.1506958007812, "completions/mean_terminated_length": 517.2562866210938, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.1119533527696793, "grad_norm": 0.1152946799993515, "learning_rate": 1e-06, "loss": 0.0493, "num_tokens": 7556360.0, "reward": 0.546875, "reward_std": 0.17923539876937866, "rewards/simpleverify_reward/mean": 0.546875, "rewards/simpleverify_reward/std": 0.4980759024620056, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2689.0, "completions/mean_length": 630.5792846679688, "completions/mean_terminated_length": 591.4661254882812, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.12128279883381925, "grad_norm": 0.1212453618645668, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 8209351.0, "reward": 0.5613839626312256, "reward_std": 0.23454692959785461, "rewards/simpleverify_reward/mean": 0.5613839030265808, "rewards/simpleverify_reward/std": 0.496494859457016, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3240.0, "completions/mean_length": 558.6763916015625, "completions/mean_terminated_length": 518.751708984375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.1306122448979592, "grad_norm": 0.1304236799478531, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 8806037.0, "reward": 0.551339328289032, "reward_std": 0.22631458938121796, "rewards/simpleverify_reward/mean": 0.5513392686843872, "rewards/simpleverify_reward/std": 0.4976350665092468, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3722.0, "completions/mean_length": 635.450927734375, "completions/mean_terminated_length": 580.5215454101562, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.13994169096209913, "grad_norm": 0.10717866569757462, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 9468865.0, "reward": 0.5502232313156128, "reward_std": 0.1979040503501892, "rewards/simpleverify_reward/mean": 0.5502232313156128, "rewards/simpleverify_reward/std": 0.49774909019470215, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3161.0, "completions/mean_length": 612.3002319335938, "completions/mean_terminated_length": 565.0101928710938, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.14927113702623906, "grad_norm": 0.11831918358802795, "learning_rate": 1e-06, "loss": 0.0503, "num_tokens": 10108950.0, "reward": 0.5301339626312256, "reward_std": 0.2140667736530304, "rewards/simpleverify_reward/mean": 0.5301339030265808, "rewards/simpleverify_reward/std": 0.49936985969543457, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2502.0, "completions/mean_length": 584.265625, "completions/mean_terminated_length": 556.6141967773438, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.158600583090379, "grad_norm": 0.12247284501791, "learning_rate": 1e-06, "loss": 0.04, "num_tokens": 10718076.0, "reward": 0.5647321939468384, "reward_std": 0.21984775364398956, "rewards/simpleverify_reward/mean": 0.5647321343421936, "rewards/simpleverify_reward/std": 0.49606895446777344, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2068.0, "completions/mean_length": 653.6205444335938, "completions/mean_terminated_length": 575.02734375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.16793002915451896, "grad_norm": 0.10902804136276245, "learning_rate": 1e-06, "loss": 0.0274, "num_tokens": 11384952.0, "reward": 0.5412946939468384, "reward_std": 0.21681120991706848, "rewards/simpleverify_reward/mean": 0.5412946343421936, "rewards/simpleverify_reward/std": 0.49857014417648315, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3921.0, "completions/mean_length": 596.2098388671875, "completions/mean_terminated_length": 556.7088012695312, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.1772594752186589, "grad_norm": 0.14531929790973663, "learning_rate": 1e-06, "loss": 0.0293, "num_tokens": 12013612.0, "reward": 0.6071428656578064, "reward_std": 0.29692038893699646, "rewards/simpleverify_reward/mean": 0.6071428656578064, "rewards/simpleverify_reward/std": 0.48865827918052673, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3325.0, "completions/mean_length": 578.9017944335938, "completions/mean_terminated_length": 531.1583862304688, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.18658892128279883, "grad_norm": 0.12044237554073334, "learning_rate": 1e-06, "loss": 0.0469, "num_tokens": 12623476.0, "reward": 0.6395089626312256, "reward_std": 0.22364641726016998, "rewards/simpleverify_reward/mean": 0.6395089030265808, "rewards/simpleverify_reward/std": 0.4804111421108246, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3899.0, "completions/mean_length": 585.6517944335938, "completions/mean_terminated_length": 525.88427734375, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.19591836734693877, "grad_norm": 0.1342894285917282, "learning_rate": 1e-06, "loss": 0.044, "num_tokens": 13230412.0, "reward": 0.5870535969734192, "reward_std": 0.19066019356250763, "rewards/simpleverify_reward/mean": 0.5870535969734192, "rewards/simpleverify_reward/std": 0.49263837933540344, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 644.2879638671875, "completions/mean_terminated_length": 593.469970703125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.20524781341107873, "grad_norm": 0.11594964563846588, "learning_rate": 1e-06, "loss": 0.0232, "num_tokens": 13903126.0, "reward": 0.5457589626312256, "reward_std": 0.20606262981891632, "rewards/simpleverify_reward/mean": 0.5457589030265808, "rewards/simpleverify_reward/std": 0.4981797933578491, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 632.09375, "completions/mean_terminated_length": 569.1136474609375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.21457725947521866, "grad_norm": 0.11405224353075027, "learning_rate": 1e-06, "loss": 0.0361, "num_tokens": 14553818.0, "reward": 0.625, "reward_std": 0.18791212141513824, "rewards/simpleverify_reward/mean": 0.625, "rewards/simpleverify_reward/std": 0.48439329862594604, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 617.3460083007812, "completions/mean_terminated_length": 574.1084594726562, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.2239067055393586, "grad_norm": 0.11643239855766296, "learning_rate": 1e-06, "loss": 0.0351, "num_tokens": 15191256.0, "reward": 0.5055803656578064, "reward_std": 0.226087749004364, "rewards/simpleverify_reward/mean": 0.5055803656578064, "rewards/simpleverify_reward/std": 0.5002480745315552, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3140.0, "completions/mean_length": 663.8170166015625, "completions/mean_terminated_length": 597.43798828125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.23323615160349853, "grad_norm": 0.10811079293489456, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 15871268.0, "reward": 0.5234375, "reward_std": 0.18731407821178436, "rewards/simpleverify_reward/mean": 0.5234375, "rewards/simpleverify_reward/std": 0.49972933530807495, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3352.0, "completions/mean_length": 598.3839721679688, "completions/mean_terminated_length": 550.905029296875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.2425655976676385, "grad_norm": 0.11829746514558792, "learning_rate": 1e-06, "loss": 0.0295, "num_tokens": 16494892.0, "reward": 0.5145089626312256, "reward_std": 0.19779525697231293, "rewards/simpleverify_reward/mean": 0.5145089030265808, "rewards/simpleverify_reward/std": 0.5000685453414917, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3515.0, "completions/mean_length": 593.640625, "completions/mean_terminated_length": 546.0972900390625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.2518950437317784, "grad_norm": 0.12933631241321564, "learning_rate": 1e-06, "loss": 0.0268, "num_tokens": 17114914.0, "reward": 0.5915178656578064, "reward_std": 0.21545250713825226, "rewards/simpleverify_reward/mean": 0.5915178656578064, "rewards/simpleverify_reward/std": 0.49182769656181335, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3950.0, "completions/mean_length": 589.1138916015625, "completions/mean_terminated_length": 557.520263671875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.2612244897959184, "grad_norm": 0.12451650202274323, "learning_rate": 1e-06, "loss": 0.0275, "num_tokens": 17732752.0, "reward": 0.5580357313156128, "reward_std": 0.21481674909591675, "rewards/simpleverify_reward/mean": 0.5580357313156128, "rewards/simpleverify_reward/std": 0.49689781665802, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.024553571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3802.0, "completions/mean_length": 648.9152221679688, "completions/mean_terminated_length": 562.1464233398438, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.2705539358600583, "grad_norm": 0.115830197930336, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 18403164.0, "reward": 0.5100446939468384, "reward_std": 0.20899637043476105, "rewards/simpleverify_reward/mean": 0.5100446343421936, "rewards/simpleverify_reward/std": 0.5001782774925232, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2972.0, "completions/mean_length": 572.5145263671875, "completions/mean_terminated_length": 524.6843872070312, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.27988338192419826, "grad_norm": 0.13132816553115845, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 19011465.0, "reward": 0.5647321939468384, "reward_std": 0.20336014032363892, "rewards/simpleverify_reward/mean": 0.5647321343421936, "rewards/simpleverify_reward/std": 0.49606895446777344, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2358.0, "completions/mean_length": 646.703125, "completions/mean_terminated_length": 611.70458984375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.2892128279883382, "grad_norm": 0.11632570624351501, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 19675791.0, "reward": 0.53125, "reward_std": 0.21184366941452026, "rewards/simpleverify_reward/mean": 0.53125, "rewards/simpleverify_reward/std": 0.4993011951446533, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3225.0, "completions/mean_length": 583.90625, "completions/mean_terminated_length": 564.1975708007812, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.29854227405247813, "grad_norm": 0.1387438178062439, "learning_rate": 1e-06, "loss": 0.0347, "num_tokens": 20289811.0, "reward": 0.5613839626312256, "reward_std": 0.23698687553405762, "rewards/simpleverify_reward/mean": 0.5613839030265808, "rewards/simpleverify_reward/std": 0.496494859457016, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 567.3839721679688, "completions/mean_terminated_length": 519.4841918945312, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.30787172011661806, "grad_norm": 0.12350305914878845, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 20884723.0, "reward": 0.5569196939468384, "reward_std": 0.18414919078350067, "rewards/simpleverify_reward/mean": 0.5569196343421936, "rewards/simpleverify_reward/std": 0.49702703952789307, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3678.0, "completions/mean_length": 639.0167846679688, "completions/mean_terminated_length": 560.0901489257812, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.317201166180758, "grad_norm": 0.11652282625436783, "learning_rate": 1e-06, "loss": 0.036, "num_tokens": 21540674.0, "reward": 0.5111607313156128, "reward_std": 0.21872234344482422, "rewards/simpleverify_reward/mean": 0.5111607313156128, "rewards/simpleverify_reward/std": 0.5001546144485474, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3409.0, "completions/mean_length": 624.9140625, "completions/mean_terminated_length": 577.7952880859375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.32653061224489793, "grad_norm": 0.11031773686408997, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 22184253.0, "reward": 0.5926339626312256, "reward_std": 0.18182799220085144, "rewards/simpleverify_reward/mean": 0.5926339030265808, "rewards/simpleverify_reward/std": 0.49161845445632935, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3563.0, "completions/mean_length": 629.685302734375, "completions/mean_terminated_length": 578.6522827148438, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.3358600583090379, "grad_norm": 0.11824522912502289, "learning_rate": 1e-06, "loss": 0.0617, "num_tokens": 22841275.0, "reward": 0.543526828289032, "reward_std": 0.21507567167282104, "rewards/simpleverify_reward/mean": 0.5435267686843872, "rewards/simpleverify_reward/std": 0.49838000535964966, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2755.0, "completions/mean_length": 589.0, "completions/mean_terminated_length": 549.4176025390625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.34518950437317786, "grad_norm": 0.12709525227546692, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 23445747.0, "reward": 0.5323660969734192, "reward_std": 0.22007739543914795, "rewards/simpleverify_reward/mean": 0.5323660969734192, "rewards/simpleverify_reward/std": 0.4992299973964691, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3441.0, "completions/mean_length": 628.7745971679688, "completions/mean_terminated_length": 545.5611572265625, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.3545189504373178, "grad_norm": 0.12149134278297424, "learning_rate": 1e-06, "loss": 0.0159, "num_tokens": 24098465.0, "reward": 0.5546875, "reward_std": 0.21706941723823547, "rewards/simpleverify_reward/mean": 0.5546875, "rewards/simpleverify_reward/std": 0.4972778558731079, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3823.0, "completions/mean_length": 594.2924194335938, "completions/mean_terminated_length": 554.769775390625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.3638483965014577, "grad_norm": 0.1358894407749176, "learning_rate": 1e-06, "loss": 0.04, "num_tokens": 24726815.0, "reward": 0.5368303656578064, "reward_std": 0.22515526413917542, "rewards/simpleverify_reward/mean": 0.5368303656578064, "rewards/simpleverify_reward/std": 0.49892017245292664, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2078.0, "completions/mean_length": 563.5234375, "completions/mean_terminated_length": 539.708984375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.37317784256559766, "grad_norm": 0.11929410696029663, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 25334084.0, "reward": 0.5334821939468384, "reward_std": 0.19798001646995544, "rewards/simpleverify_reward/mean": 0.5334821343421936, "rewards/simpleverify_reward/std": 0.49915632605552673, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.024553571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 683.958740234375, "completions/mean_terminated_length": 598.0720825195312, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.3825072886297376, "grad_norm": 0.12098333984613419, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 26048695.0, "reward": 0.504464328289032, "reward_std": 0.22571726143360138, "rewards/simpleverify_reward/mean": 0.5044642686843872, "rewards/simpleverify_reward/std": 0.5002593398094177, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 2826.0, "completions/mean_length": 619.5892944335938, "completions/mean_terminated_length": 548.3189086914062, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.39183673469387753, "grad_norm": 0.11403215676546097, "learning_rate": 1e-06, "loss": 0.0551, "num_tokens": 26695047.0, "reward": 0.5256696939468384, "reward_std": 0.18855221569538116, "rewards/simpleverify_reward/mean": 0.5256696343421936, "rewards/simpleverify_reward/std": 0.4996195435523987, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4034.0, "completions/mean_length": 571.2176513671875, "completions/mean_terminated_length": 543.4634399414062, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.40116618075801747, "grad_norm": 0.12272074073553085, "learning_rate": 1e-06, "loss": 0.0208, "num_tokens": 27292994.0, "reward": 0.5491071939468384, "reward_std": 0.21327608823776245, "rewards/simpleverify_reward/mean": 0.5491071343421936, "rewards/simpleverify_reward/std": 0.49786055088043213, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 519.310302734375, "completions/mean_terminated_length": 495.19775390625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.41049562682215746, "grad_norm": 0.12676896154880524, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 27844488.0, "reward": 0.625, "reward_std": 0.18735052645206451, "rewards/simpleverify_reward/mean": 0.625, "rewards/simpleverify_reward/std": 0.48439329862594604, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 630.2745971679688, "completions/mean_terminated_length": 563.246826171875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.4198250728862974, "grad_norm": 0.12029924243688583, "learning_rate": 1e-06, "loss": 0.0348, "num_tokens": 28502606.0, "reward": 0.5881696939468384, "reward_std": 0.225113183259964, "rewards/simpleverify_reward/mean": 0.5881696343421936, "rewards/simpleverify_reward/std": 0.4924395978450775, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3933.0, "completions/mean_length": 668.0703125, "completions/mean_terminated_length": 621.537353515625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.4291545189504373, "grad_norm": 0.10895045846700668, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 29189645.0, "reward": 0.5390625, "reward_std": 0.19520561397075653, "rewards/simpleverify_reward/mean": 0.5390625, "rewards/simpleverify_reward/std": 0.4987502098083496, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2642.0, "completions/mean_length": 630.3114013671875, "completions/mean_terminated_length": 583.265869140625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.43848396501457726, "grad_norm": 0.12197647988796234, "learning_rate": 1e-06, "loss": 0.053, "num_tokens": 29839300.0, "reward": 0.5703125, "reward_std": 0.23645879328250885, "rewards/simpleverify_reward/mean": 0.5703125, "rewards/simpleverify_reward/std": 0.49530795216560364, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3868.0, "completions/mean_length": 599.068115234375, "completions/mean_terminated_length": 563.5862426757812, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.4478134110787172, "grad_norm": 0.1221025139093399, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 30470713.0, "reward": 0.5189732313156128, "reward_std": 0.18633092939853668, "rewards/simpleverify_reward/mean": 0.5189732313156128, "rewards/simpleverify_reward/std": 0.49991893768310547, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3600.0, "completions/mean_length": 606.9442138671875, "completions/mean_terminated_length": 563.577392578125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.45714285714285713, "grad_norm": 0.11962051689624786, "learning_rate": 1e-06, "loss": 0.0174, "num_tokens": 31097551.0, "reward": 0.4988839626312256, "reward_std": 0.20827393233776093, "rewards/simpleverify_reward/mean": 0.4988839328289032, "rewards/simpleverify_reward/std": 0.5002779960632324, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029017857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 650.4152221679688, "completions/mean_terminated_length": 547.4436645507812, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.46647230320699706, "grad_norm": 0.1175539568066597, "learning_rate": 1e-06, "loss": 0.0483, "num_tokens": 31782859.0, "reward": 0.5424107313156128, "reward_std": 0.2198156714439392, "rewards/simpleverify_reward/mean": 0.5424107313156128, "rewards/simpleverify_reward/std": 0.4984763264656067, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 630.0089721679688, "completions/mean_terminated_length": 554.9190063476562, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.47580174927113705, "grad_norm": 0.1235552653670311, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 32428555.0, "reward": 0.5859375, "reward_std": 0.23457902669906616, "rewards/simpleverify_reward/mean": 0.5859375, "rewards/simpleverify_reward/std": 0.4928344786167145, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3703.0, "completions/mean_length": 582.958740234375, "completions/mean_terminated_length": 539.2938232421875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.485131195335277, "grad_norm": 0.11573449522256851, "learning_rate": 1e-06, "loss": 0.0243, "num_tokens": 33030574.0, "reward": 0.625, "reward_std": 0.1975695639848709, "rewards/simpleverify_reward/mean": 0.625, "rewards/simpleverify_reward/std": 0.48439329862594604, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3344.0, "completions/mean_length": 599.9520263671875, "completions/mean_terminated_length": 540.427978515625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.4944606413994169, "grad_norm": 0.12617427110671997, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 33653883.0, "reward": 0.5647321939468384, "reward_std": 0.20162460207939148, "rewards/simpleverify_reward/mean": 0.5647321343421936, "rewards/simpleverify_reward/std": 0.49606895446777344, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3807.0, "completions/mean_length": 551.0167846679688, "completions/mean_terminated_length": 506.9548034667969, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.5037900874635568, "grad_norm": 0.1326778382062912, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 34237434.0, "reward": 0.5725446939468384, "reward_std": 0.2153034806251526, "rewards/simpleverify_reward/mean": 0.5725446343421936, "rewards/simpleverify_reward/std": 0.49498558044433594, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3954.0, "completions/mean_length": 544.6830444335938, "completions/mean_terminated_length": 516.7199096679688, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5131195335276968, "grad_norm": 0.12569890916347504, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 34819862.0, "reward": 0.5848214626312256, "reward_std": 0.19186340272426605, "rewards/simpleverify_reward/mean": 0.5848214030265808, "rewards/simpleverify_reward/std": 0.49302801489830017, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3329.0, "completions/mean_length": 618.8002319335938, "completions/mean_terminated_length": 555.578369140625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.5224489795918368, "grad_norm": 0.11914178729057312, "learning_rate": 1e-06, "loss": 0.0246, "num_tokens": 35465715.0, "reward": 0.5133928656578064, "reward_std": 0.20260311663150787, "rewards/simpleverify_reward/mean": 0.5133928656578064, "rewards/simpleverify_reward/std": 0.500099778175354, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 579.9408569335938, "completions/mean_terminated_length": 520.0760498046875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.5317784256559767, "grad_norm": 0.12256541103124619, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 36072094.0, "reward": 0.578125, "reward_std": 0.2000095248222351, "rewards/simpleverify_reward/mean": 0.578125, "rewards/simpleverify_reward/std": 0.4941346049308777, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 605.4285888671875, "completions/mean_terminated_length": 545.9977416992188, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.5411078717201167, "grad_norm": 0.13024821877479553, "learning_rate": 1e-06, "loss": 0.0587, "num_tokens": 36704422.0, "reward": 0.5580357313156128, "reward_std": 0.2245907187461853, "rewards/simpleverify_reward/mean": 0.5580357313156128, "rewards/simpleverify_reward/std": 0.49689781665802, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3938.0, "completions/mean_length": 565.739990234375, "completions/mean_terminated_length": 529.919921875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.5504373177842565, "grad_norm": 0.12135731428861618, "learning_rate": 1e-06, "loss": 0.0159, "num_tokens": 37305469.0, "reward": 0.5915178656578064, "reward_std": 0.17048172652721405, "rewards/simpleverify_reward/mean": 0.5915178656578064, "rewards/simpleverify_reward/std": 0.49182766675949097, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 570.9375, "completions/mean_terminated_length": 527.1231689453125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5597667638483965, "grad_norm": 0.11783162504434586, "learning_rate": 1e-06, "loss": 0.0202, "num_tokens": 37902037.0, "reward": 0.5948660969734192, "reward_std": 0.1698751002550125, "rewards/simpleverify_reward/mean": 0.5948660969734192, "rewards/simpleverify_reward/std": 0.49119213223457336, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0323660714285714, "completions/max_length": 4096.0, "completions/max_terminated_length": 3660.0, "completions/mean_length": 702.7522583007812, "completions/mean_terminated_length": 589.2525634765625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.5690962099125364, "grad_norm": 0.11599902808666229, "learning_rate": 1e-06, "loss": 0.0438, "num_tokens": 38615727.0, "reward": 0.5368303656578064, "reward_std": 0.22485104203224182, "rewards/simpleverify_reward/mean": 0.5368303656578064, "rewards/simpleverify_reward/std": 0.49892017245292664, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3937.0, "completions/mean_length": 661.450927734375, "completions/mean_terminated_length": 595.026123046875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.5784256559766764, "grad_norm": 0.12406815588474274, "learning_rate": 1e-06, "loss": 0.0387, "num_tokens": 39292355.0, "reward": 0.53125, "reward_std": 0.23086097836494446, "rewards/simpleverify_reward/mean": 0.53125, "rewards/simpleverify_reward/std": 0.4993011951446533, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3837.0, "completions/mean_length": 600.5123291015625, "completions/mean_terminated_length": 540.9977416992188, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.5877551020408164, "grad_norm": 0.12974587082862854, "learning_rate": 1e-06, "loss": 0.0366, "num_tokens": 39909070.0, "reward": 0.5993303656578064, "reward_std": 0.20951178669929504, "rewards/simpleverify_reward/mean": 0.5993303656578064, "rewards/simpleverify_reward/std": 0.49030786752700806, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3879.0, "completions/mean_length": 612.0435791015625, "completions/mean_terminated_length": 560.7508544921875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.5970845481049563, "grad_norm": 0.13096298277378082, "learning_rate": 1e-06, "loss": 0.0367, "num_tokens": 40547229.0, "reward": 0.6116071939468384, "reward_std": 0.22500371932983398, "rewards/simpleverify_reward/mean": 0.6116071343421936, "rewards/simpleverify_reward/std": 0.48765692114830017, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3510.0, "completions/mean_length": 621.9967041015625, "completions/mean_terminated_length": 578.8169555664062, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.6064139941690962, "grad_norm": 0.11201366782188416, "learning_rate": 1e-06, "loss": 0.0351, "num_tokens": 41202218.0, "reward": 0.5424107313156128, "reward_std": 0.18272772431373596, "rewards/simpleverify_reward/mean": 0.5424107313156128, "rewards/simpleverify_reward/std": 0.4984763264656067, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 578.9765625, "completions/mean_terminated_length": 527.197021484375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6157434402332361, "grad_norm": 0.11020374298095703, "learning_rate": 1e-06, "loss": 0.0334, "num_tokens": 41819517.0, "reward": 0.59375, "reward_std": 0.172694131731987, "rewards/simpleverify_reward/mean": 0.59375, "rewards/simpleverify_reward/std": 0.4914066195487976, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3766.0, "completions/mean_length": 701.2756958007812, "completions/mean_terminated_length": 635.6211547851562, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.6250728862973761, "grad_norm": 0.10032541304826736, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 42543532.0, "reward": 0.520089328289032, "reward_std": 0.18039767444133759, "rewards/simpleverify_reward/mean": 0.5200892686843872, "rewards/simpleverify_reward/std": 0.4998753070831299, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3189.0, "completions/mean_length": 620.6752319335938, "completions/mean_terminated_length": 565.5113525390625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.634402332361516, "grad_norm": 0.12269850075244904, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 43183281.0, "reward": 0.5535714626312256, "reward_std": 0.1859925240278244, "rewards/simpleverify_reward/mean": 0.5535714030265808, "rewards/simpleverify_reward/std": 0.4973995089530945, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0279017857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 2996.0, "completions/mean_length": 685.7667846679688, "completions/mean_terminated_length": 587.884033203125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.643731778425656, "grad_norm": 0.12374056875705719, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 43888096.0, "reward": 0.546875, "reward_std": 0.24593418836593628, "rewards/simpleverify_reward/mean": 0.546875, "rewards/simpleverify_reward/std": 0.4980759024620056, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3184.0, "completions/mean_length": 645.2779541015625, "completions/mean_terminated_length": 590.5045166015625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.6530612244897959, "grad_norm": 0.12285792827606201, "learning_rate": 1e-06, "loss": 0.0321, "num_tokens": 44556305.0, "reward": 0.5290178656578064, "reward_std": 0.244048073887825, "rewards/simpleverify_reward/mean": 0.5290178656578064, "rewards/simpleverify_reward/std": 0.49943602085113525, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 602.404052734375, "completions/mean_terminated_length": 570.93017578125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.6623906705539359, "grad_norm": 0.11323628574609756, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 45195643.0, "reward": 0.543526828289032, "reward_std": 0.18632951378822327, "rewards/simpleverify_reward/mean": 0.5435267686843872, "rewards/simpleverify_reward/std": 0.49838000535964966, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4007.0, "completions/mean_length": 660.3359375, "completions/mean_terminated_length": 585.903076171875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.6717201166180758, "grad_norm": 0.12171682715415955, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 45884744.0, "reward": 0.504464328289032, "reward_std": 0.2113219052553177, "rewards/simpleverify_reward/mean": 0.5044642686843872, "rewards/simpleverify_reward/std": 0.5002593398094177, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3743.0, "completions/mean_length": 632.794677734375, "completions/mean_terminated_length": 577.8231201171875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.6810495626822157, "grad_norm": 0.1217542216181755, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 46548504.0, "reward": 0.5602678656578064, "reward_std": 0.17356246709823608, "rewards/simpleverify_reward/mean": 0.5602678656578064, "rewards/simpleverify_reward/std": 0.4966317415237427, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4016.0, "completions/mean_length": 651.341552734375, "completions/mean_terminated_length": 600.6273803710938, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.6903790087463557, "grad_norm": 0.1112971380352974, "learning_rate": 1e-06, "loss": 0.0394, "num_tokens": 47221810.0, "reward": 0.5636160969734192, "reward_std": 0.19035135209560394, "rewards/simpleverify_reward/mean": 0.5636160969734192, "rewards/simpleverify_reward/std": 0.49621346592903137, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2989.0, "completions/mean_length": 654.5569458007812, "completions/mean_terminated_length": 579.9988403320312, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.6997084548104956, "grad_norm": 0.1100168228149414, "learning_rate": 1e-06, "loss": 0.0319, "num_tokens": 47893357.0, "reward": 0.5290178656578064, "reward_std": 0.175893634557724, "rewards/simpleverify_reward/mean": 0.5290178656578064, "rewards/simpleverify_reward/std": 0.49943602085113525, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3839.0, "completions/mean_length": 649.1942138671875, "completions/mean_terminated_length": 610.2911987304688, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.7090379008746356, "grad_norm": 0.13102756440639496, "learning_rate": 1e-06, "loss": 0.0207, "num_tokens": 48568275.0, "reward": 0.5256696939468384, "reward_std": 0.23747432231903076, "rewards/simpleverify_reward/mean": 0.5256696343421936, "rewards/simpleverify_reward/std": 0.4996195137500763, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.030133928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3804.0, "completions/mean_length": 658.3660888671875, "completions/mean_terminated_length": 551.55810546875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.7183673469387755, "grad_norm": 0.13023999333381653, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 49249795.0, "reward": 0.5390625, "reward_std": 0.22807759046554565, "rewards/simpleverify_reward/mean": 0.5390625, "rewards/simpleverify_reward/std": 0.4987502098083496, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 593.0848388671875, "completions/mean_terminated_length": 553.5485229492188, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.7276967930029155, "grad_norm": 0.12320899218320847, "learning_rate": 1e-06, "loss": 0.0165, "num_tokens": 49867111.0, "reward": 0.5892857313156128, "reward_std": 0.18911784887313843, "rewards/simpleverify_reward/mean": 0.5892857313156128, "rewards/simpleverify_reward/std": 0.49223825335502625, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2894.0, "completions/mean_length": 598.8460083007812, "completions/mean_terminated_length": 543.3356323242188, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7370262390670554, "grad_norm": 0.12949274480342865, "learning_rate": 1e-06, "loss": 0.034, "num_tokens": 50492765.0, "reward": 0.5870535969734192, "reward_std": 0.2266918420791626, "rewards/simpleverify_reward/mean": 0.5870535969734192, "rewards/simpleverify_reward/std": 0.49263834953308105, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3942.0, "completions/mean_length": 559.724365234375, "completions/mean_terminated_length": 527.8660278320312, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.7463556851311953, "grad_norm": 0.14369846880435944, "learning_rate": 1e-06, "loss": 0.0369, "num_tokens": 51084326.0, "reward": 0.6227678656578064, "reward_std": 0.2460034042596817, "rewards/simpleverify_reward/mean": 0.6227678656578064, "rewards/simpleverify_reward/std": 0.4849644899368286, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2363.0, "completions/mean_length": 637.3928833007812, "completions/mean_terminated_length": 586.473388671875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.7556851311953353, "grad_norm": 0.12238933145999908, "learning_rate": 1e-06, "loss": 0.0287, "num_tokens": 51751246.0, "reward": 0.5580357313156128, "reward_std": 0.21642820537090302, "rewards/simpleverify_reward/mean": 0.5580357313156128, "rewards/simpleverify_reward/std": 0.49689781665802, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3551.0, "completions/mean_length": 646.958740234375, "completions/mean_terminated_length": 580.253662109375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.7650145772594752, "grad_norm": 0.11870317906141281, "learning_rate": 1e-06, "loss": 0.0201, "num_tokens": 52425089.0, "reward": 0.574776828289032, "reward_std": 0.20054084062576294, "rewards/simpleverify_reward/mean": 0.5747767686843872, "rewards/simpleverify_reward/std": 0.49465295672416687, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 584.2879638671875, "completions/mean_terminated_length": 556.63671875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.7743440233236152, "grad_norm": 0.11835742741823196, "learning_rate": 1e-06, "loss": 0.023, "num_tokens": 53033827.0, "reward": 0.6361607313156128, "reward_std": 0.21586939692497253, "rewards/simpleverify_reward/mean": 0.6361607313156128, "rewards/simpleverify_reward/std": 0.4813718795776367, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3632.0, "completions/mean_length": 650.4598388671875, "completions/mean_terminated_length": 587.8135986328125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.7836734693877551, "grad_norm": 0.1102403774857521, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 53712255.0, "reward": 0.4765625298023224, "reward_std": 0.18017829954624176, "rewards/simpleverify_reward/mean": 0.4765625, "rewards/simpleverify_reward/std": 0.49972933530807495, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3609.0, "completions/mean_length": 611.9765625, "completions/mean_terminated_length": 552.6572265625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.793002915451895, "grad_norm": 0.12602129578590393, "learning_rate": 1e-06, "loss": 0.0315, "num_tokens": 54350314.0, "reward": 0.5647321939468384, "reward_std": 0.21869054436683655, "rewards/simpleverify_reward/mean": 0.5647321343421936, "rewards/simpleverify_reward/std": 0.49606895446777344, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 4007.0, "completions/mean_length": 587.724365234375, "completions/mean_terminated_length": 571.9921875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.8023323615160349, "grad_norm": 0.11219862103462219, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 54963779.0, "reward": 0.598214328289032, "reward_std": 0.1873914748430252, "rewards/simpleverify_reward/mean": 0.5982142686843872, "rewards/simpleverify_reward/std": 0.49053290486335754, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3637.0, "completions/mean_length": 566.208740234375, "completions/mean_terminated_length": 530.3934326171875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.8116618075801749, "grad_norm": 0.12410742044448853, "learning_rate": 1e-06, "loss": 0.0283, "num_tokens": 55551766.0, "reward": 0.6049107313156128, "reward_std": 0.19643910229206085, "rewards/simpleverify_reward/mean": 0.6049107313156128, "rewards/simpleverify_reward/std": 0.48914292454719543, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 569.9152221679688, "completions/mean_terminated_length": 534.1375122070312, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.8209912536443149, "grad_norm": 0.12352431565523148, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 56142482.0, "reward": 0.6328125, "reward_std": 0.18682058155536652, "rewards/simpleverify_reward/mean": 0.6328125, "rewards/simpleverify_reward/std": 0.48230743408203125, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3219.0, "completions/mean_length": 638.3225708007812, "completions/mean_terminated_length": 579.4517822265625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8303206997084548, "grad_norm": 0.12363302707672119, "learning_rate": 1e-06, "loss": 0.0546, "num_tokens": 56816267.0, "reward": 0.5412946939468384, "reward_std": 0.21406814455986023, "rewards/simpleverify_reward/mean": 0.5412946343421936, "rewards/simpleverify_reward/std": 0.49857014417648315, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2342.0, "completions/mean_length": 603.1295166015625, "completions/mean_terminated_length": 567.6887817382812, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.8396501457725948, "grad_norm": 0.11528212577104568, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 57444223.0, "reward": 0.5390625, "reward_std": 0.18640761077404022, "rewards/simpleverify_reward/mean": 0.5390625, "rewards/simpleverify_reward/std": 0.4987502098083496, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3980.0, "completions/mean_length": 603.4699096679688, "completions/mean_terminated_length": 552.0509643554688, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.8489795918367347, "grad_norm": 0.14152829349040985, "learning_rate": 1e-06, "loss": 0.0443, "num_tokens": 58072892.0, "reward": 0.5848214626312256, "reward_std": 0.22451403737068176, "rewards/simpleverify_reward/mean": 0.5848214030265808, "rewards/simpleverify_reward/std": 0.49302801489830017, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3324.0, "completions/mean_length": 623.396240234375, "completions/mean_terminated_length": 564.2713012695312, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.8583090379008746, "grad_norm": 0.12112051993608475, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 58718535.0, "reward": 0.6116071939468384, "reward_std": 0.19125612080097198, "rewards/simpleverify_reward/mean": 0.6116071343421936, "rewards/simpleverify_reward/std": 0.4876568913459778, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 612.1607666015625, "completions/mean_terminated_length": 544.78271484375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.8676384839650145, "grad_norm": 0.12027369439601898, "learning_rate": 1e-06, "loss": 0.034, "num_tokens": 59364567.0, "reward": 0.5625, "reward_std": 0.24116164445877075, "rewards/simpleverify_reward/mean": 0.5625, "rewards/simpleverify_reward/std": 0.49635544419288635, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3712.0, "completions/mean_length": 597.9230346679688, "completions/mean_terminated_length": 566.4088134765625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.8769679300291545, "grad_norm": 0.11804047971963882, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 59977058.0, "reward": 0.5970982313156128, "reward_std": 0.1844968944787979, "rewards/simpleverify_reward/mean": 0.5970982313156128, "rewards/simpleverify_reward/std": 0.4907552897930145, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4038.0, "completions/mean_length": 630.841552734375, "completions/mean_terminated_length": 603.5568237304688, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.8862973760932945, "grad_norm": 0.12228105962276459, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 60629788.0, "reward": 0.546875, "reward_std": 0.19734950363636017, "rewards/simpleverify_reward/mean": 0.546875, "rewards/simpleverify_reward/std": 0.4980759024620056, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 600.6785888671875, "completions/mean_terminated_length": 553.2307739257812, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.8956268221574344, "grad_norm": 0.12868352234363556, "learning_rate": 1e-06, "loss": 0.041, "num_tokens": 61251452.0, "reward": 0.578125, "reward_std": 0.2503523826599121, "rewards/simpleverify_reward/mean": 0.578125, "rewards/simpleverify_reward/std": 0.4941346049308777, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2348.0, "completions/mean_length": 615.6808471679688, "completions/mean_terminated_length": 564.441650390625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9049562682215744, "grad_norm": 0.12582844495773315, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 61894398.0, "reward": 0.5647321939468384, "reward_std": 0.21891415119171143, "rewards/simpleverify_reward/mean": 0.5647321343421936, "rewards/simpleverify_reward/std": 0.49606895446777344, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 690.911865234375, "completions/mean_terminated_length": 621.1036987304688, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.9142857142857143, "grad_norm": 0.10801351070404053, "learning_rate": 1e-06, "loss": 0.0256, "num_tokens": 62599383.0, "reward": 0.5133928656578064, "reward_std": 0.19990073144435883, "rewards/simpleverify_reward/mean": 0.5133928656578064, "rewards/simpleverify_reward/std": 0.500099778175354, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 663.3292846679688, "completions/mean_terminated_length": 580.9451293945312, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.9236151603498542, "grad_norm": 0.11756055802106857, "learning_rate": 1e-06, "loss": 0.02, "num_tokens": 63292942.0, "reward": 0.504464328289032, "reward_std": 0.20422455668449402, "rewards/simpleverify_reward/mean": 0.5044642686843872, "rewards/simpleverify_reward/std": 0.5002593398094177, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3575.0, "completions/mean_length": 633.2410888671875, "completions/mean_terminated_length": 574.2838134765625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.9329446064139941, "grad_norm": 0.11402909457683563, "learning_rate": 1e-06, "loss": 0.0262, "num_tokens": 63950590.0, "reward": 0.5915178656578064, "reward_std": 0.1917753517627716, "rewards/simpleverify_reward/mean": 0.5915178656578064, "rewards/simpleverify_reward/std": 0.49182769656181335, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3152.0, "completions/mean_length": 605.6796875, "completions/mean_terminated_length": 558.2998046875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.9422740524781341, "grad_norm": 0.12598371505737305, "learning_rate": 1e-06, "loss": 0.0401, "num_tokens": 64578799.0, "reward": 0.5691964626312256, "reward_std": 0.21358031034469604, "rewards/simpleverify_reward/mean": 0.5691964030265808, "rewards/simpleverify_reward/std": 0.4954652488231659, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2644.0, "completions/mean_length": 596.6049194335938, "completions/mean_terminated_length": 569.0506591796875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.9516034985422741, "grad_norm": 0.130088672041893, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 65213269.0, "reward": 0.5636160969734192, "reward_std": 0.19787195324897766, "rewards/simpleverify_reward/mean": 0.5636160969734192, "rewards/simpleverify_reward/std": 0.49621346592903137, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0279017857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 712.013427734375, "completions/mean_terminated_length": 614.884033203125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.960932944606414, "grad_norm": 0.09980812668800354, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 65938697.0, "reward": 0.5256696939468384, "reward_std": 0.15984290838241577, "rewards/simpleverify_reward/mean": 0.5256696343421936, "rewards/simpleverify_reward/std": 0.4996195137500763, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2107.0, "completions/mean_length": 599.25, "completions/mean_terminated_length": 539.7139892578125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.970262390670554, "grad_norm": 0.13485193252563477, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 66568569.0, "reward": 0.5803571939468384, "reward_std": 0.23101434111595154, "rewards/simpleverify_reward/mean": 0.5803571343421936, "rewards/simpleverify_reward/std": 0.4937761127948761, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 4044.0, "completions/mean_length": 573.0045166015625, "completions/mean_terminated_length": 533.2415771484375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.9795918367346939, "grad_norm": 0.12294933944940567, "learning_rate": 1e-06, "loss": 0.0267, "num_tokens": 67169877.0, "reward": 0.5535714626312256, "reward_std": 0.20177684724330902, "rewards/simpleverify_reward/mean": 0.5535714030265808, "rewards/simpleverify_reward/std": 0.4973994791507721, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 584.4855346679688, "completions/mean_terminated_length": 552.8502197265625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.9889212827988338, "grad_norm": 0.1191057488322258, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 67775768.0, "reward": 0.559151828289032, "reward_std": 0.162400484085083, "rewards/simpleverify_reward/mean": 0.5591517686843872, "rewards/simpleverify_reward/std": 0.496766060590744, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014204545454545414, "completions/max_length": 4096.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 551.9005737304688, "completions/mean_terminated_length": 500.83282470703125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.9982507288629737, "grad_norm": 0.1294296830892563, "learning_rate": 1e-06, "loss": 0.0408, "num_tokens": 68398314.0, "reward": 0.5379464626312256, "reward_std": 0.21440376341342926, "rewards/simpleverify_reward/mean": 0.5379464030265808, "rewards/simpleverify_reward/std": 0.4988364577293396, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 580.169677734375, "completions/mean_terminated_length": 548.4954833984375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 1.00932944606414, "grad_norm": 0.13090579211711884, "learning_rate": 1e-06, "loss": 0.0362, "num_tokens": 69008570.0, "reward": 0.5602678656578064, "reward_std": 0.2296549528837204, "rewards/simpleverify_reward/mean": 0.5602678656578064, "rewards/simpleverify_reward/std": 0.4966317117214203, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 596.6517944335938, "completions/mean_terminated_length": 545.1325073242188, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 1.01865889212828, "grad_norm": 0.12535984814167023, "learning_rate": 1e-06, "loss": 0.0425, "num_tokens": 69639618.0, "reward": 0.5569196939468384, "reward_std": 0.212483748793602, "rewards/simpleverify_reward/mean": 0.5569196343421936, "rewards/simpleverify_reward/std": 0.4970270097255707, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2234.0, "completions/mean_length": 663.2645263671875, "completions/mean_terminated_length": 580.8788452148438, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.0279883381924197, "grad_norm": 0.11190024763345718, "learning_rate": 1e-06, "loss": 0.0439, "num_tokens": 70326903.0, "reward": 0.5256696939468384, "reward_std": 0.1898653209209442, "rewards/simpleverify_reward/mean": 0.5256696343421936, "rewards/simpleverify_reward/std": 0.4996195435523987, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 629.5167846679688, "completions/mean_terminated_length": 554.4161987304688, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 1.0373177842565597, "grad_norm": 0.11574259400367737, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 70995326.0, "reward": 0.4776785969734192, "reward_std": 0.17112110555171967, "rewards/simpleverify_reward/mean": 0.4776785671710968, "rewards/simpleverify_reward/std": 0.4997805058956146, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 576.5100708007812, "completions/mean_terminated_length": 528.7341918945312, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 1.0466472303206997, "grad_norm": 0.13990332186222076, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 71606455.0, "reward": 0.6004464626312256, "reward_std": 0.22867925465106964, "rewards/simpleverify_reward/mean": 0.6004464030265808, "rewards/simpleverify_reward/std": 0.49008017778396606, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3491.0, "completions/mean_length": 613.6808471679688, "completions/mean_terminated_length": 574.3770141601562, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 1.0559766763848397, "grad_norm": 0.10941661894321442, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 72246945.0, "reward": 0.574776828289032, "reward_std": 0.1856580376625061, "rewards/simpleverify_reward/mean": 0.5747767686843872, "rewards/simpleverify_reward/std": 0.49465295672416687, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 632.0814819335938, "completions/mean_terminated_length": 581.0838012695312, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 1.0653061224489795, "grad_norm": 0.12353738397359848, "learning_rate": 1e-06, "loss": 0.0338, "num_tokens": 72924738.0, "reward": 0.5189732313156128, "reward_std": 0.1908770352602005, "rewards/simpleverify_reward/mean": 0.5189732313156128, "rewards/simpleverify_reward/std": 0.49991893768310547, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 585.1886596679688, "completions/mean_terminated_length": 549.56591796875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 1.0746355685131195, "grad_norm": 0.11737425625324249, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 73536979.0, "reward": 0.5602678656578064, "reward_std": 0.18475720286369324, "rewards/simpleverify_reward/mean": 0.5602678656578064, "rewards/simpleverify_reward/std": 0.4966317415237427, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3227.0, "completions/mean_length": 611.3270263671875, "completions/mean_terminated_length": 564.0238037109375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 1.0839650145772595, "grad_norm": 0.11239819973707199, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 74172568.0, "reward": 0.6272321939468384, "reward_std": 0.19189409911632538, "rewards/simpleverify_reward/mean": 0.6272321343421936, "rewards/simpleverify_reward/std": 0.4838111698627472, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 4051.0, "completions/mean_length": 591.083740234375, "completions/mean_terminated_length": 551.5248413085938, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 1.0932944606413995, "grad_norm": 0.14145003259181976, "learning_rate": 1e-06, "loss": 0.0337, "num_tokens": 74787299.0, "reward": 0.5546875, "reward_std": 0.24276971817016602, "rewards/simpleverify_reward/mean": 0.5546875, "rewards/simpleverify_reward/std": 0.4972778558731079, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3802.0, "completions/mean_length": 609.833740234375, "completions/mean_terminated_length": 570.4864501953125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 1.1026239067055394, "grad_norm": 0.11767745763063431, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 75408062.0, "reward": 0.6227678656578064, "reward_std": 0.20388300716876984, "rewards/simpleverify_reward/mean": 0.6227678656578064, "rewards/simpleverify_reward/std": 0.4849644899368286, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2186.0, "completions/mean_length": 574.7611694335938, "completions/mean_terminated_length": 522.9195556640625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 1.1119533527696792, "grad_norm": 0.12074735015630722, "learning_rate": 1e-06, "loss": 0.0292, "num_tokens": 75996672.0, "reward": 0.6595982313156128, "reward_std": 0.17266161739826202, "rewards/simpleverify_reward/mean": 0.6595982313156128, "rewards/simpleverify_reward/std": 0.4741089344024658, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 629.341552734375, "completions/mean_terminated_length": 570.31787109375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 1.1212827988338192, "grad_norm": 0.12106913328170776, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 76657202.0, "reward": 0.5837053656578064, "reward_std": 0.20373035967350006, "rewards/simpleverify_reward/mean": 0.5837053656578064, "rewards/simpleverify_reward/std": 0.49321895837783813, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3826.0, "completions/mean_length": 612.8527221679688, "completions/mean_terminated_length": 581.4729614257812, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 1.1306122448979592, "grad_norm": 0.1349433809518814, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 77303518.0, "reward": 0.551339328289032, "reward_std": 0.20587536692619324, "rewards/simpleverify_reward/mean": 0.5513392686843872, "rewards/simpleverify_reward/std": 0.4976350665092468, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3669.0, "completions/mean_length": 576.390625, "completions/mean_terminated_length": 552.6629028320312, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 1.1399416909620992, "grad_norm": 0.12937355041503906, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 77898532.0, "reward": 0.6194196939468384, "reward_std": 0.20632225275039673, "rewards/simpleverify_reward/mean": 0.6194196343421936, "rewards/simpleverify_reward/std": 0.48580074310302734, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3294.0, "completions/mean_length": 635.2533569335938, "completions/mean_terminated_length": 588.27490234375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.149271137026239, "grad_norm": 0.11539506912231445, "learning_rate": 1e-06, "loss": 0.0298, "num_tokens": 78557807.0, "reward": 0.5714285969734192, "reward_std": 0.21966342628002167, "rewards/simpleverify_reward/mean": 0.5714285969734192, "rewards/simpleverify_reward/std": 0.49514806270599365, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2348.0, "completions/mean_length": 588.6953125, "completions/mean_terminated_length": 561.0787353515625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 1.158600583090379, "grad_norm": 0.11508555710315704, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 79174790.0, "reward": 0.606026828289032, "reward_std": 0.16668306291103363, "rewards/simpleverify_reward/mean": 0.6060267686843872, "rewards/simpleverify_reward/std": 0.48890194296836853, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3644.0, "completions/mean_length": 635.6529541015625, "completions/mean_terminated_length": 584.7078247070312, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.167930029154519, "grad_norm": 0.1126394271850586, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 79834679.0, "reward": 0.4933035969734192, "reward_std": 0.17626594007015228, "rewards/simpleverify_reward/mean": 0.4933035671710968, "rewards/simpleverify_reward/std": 0.5002344250679016, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 594.544677734375, "completions/mean_terminated_length": 563.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 1.177259475218659, "grad_norm": 0.11674212664365768, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 80455871.0, "reward": 0.6428571939468384, "reward_std": 0.1668703407049179, "rewards/simpleverify_reward/mean": 0.6428571343421936, "rewards/simpleverify_reward/std": 0.4794250428676605, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3019.0, "completions/mean_length": 608.7645263671875, "completions/mean_terminated_length": 561.426513671875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 1.186588921282799, "grad_norm": 0.12868697941303253, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 81083924.0, "reward": 0.6339285969734192, "reward_std": 0.18310751020908356, "rewards/simpleverify_reward/mean": 0.6339285969734192, "rewards/simpleverify_reward/std": 0.48199838399887085, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3870.0, "completions/mean_length": 617.6730346679688, "completions/mean_terminated_length": 574.4395751953125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.1959183673469387, "grad_norm": 0.1280255764722824, "learning_rate": 1e-06, "loss": 0.0339, "num_tokens": 81729279.0, "reward": 0.5558035969734192, "reward_std": 0.2603527903556824, "rewards/simpleverify_reward/mean": 0.5558035969734192, "rewards/simpleverify_reward/std": 0.49715369939804077, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2672.0, "completions/mean_length": 646.0435791015625, "completions/mean_terminated_length": 583.3170166015625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.2052478134110787, "grad_norm": 0.13409265875816345, "learning_rate": 1e-06, "loss": 0.0322, "num_tokens": 82401222.0, "reward": 0.5736607313156128, "reward_std": 0.23817941546440125, "rewards/simpleverify_reward/mean": 0.5736607313156128, "rewards/simpleverify_reward/std": 0.4948205351829529, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3942.0, "completions/mean_length": 679.302490234375, "completions/mean_terminated_length": 613.2229614257812, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 1.2145772594752187, "grad_norm": 0.123910091817379, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 83103021.0, "reward": 0.5033482313156128, "reward_std": 0.2286771684885025, "rewards/simpleverify_reward/mean": 0.5033482313156128, "rewards/simpleverify_reward/std": 0.5002680420875549, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3948.0, "completions/mean_length": 567.9877319335938, "completions/mean_terminated_length": 536.203857421875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 1.2239067055393587, "grad_norm": 0.1286127269268036, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 83697090.0, "reward": 0.6261160969734192, "reward_std": 0.19730813801288605, "rewards/simpleverify_reward/mean": 0.6261160969734192, "rewards/simpleverify_reward/std": 0.48410359025001526, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3453.0, "completions/mean_length": 565.1752319335938, "completions/mean_terminated_length": 537.3734741210938, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 1.2332361516034984, "grad_norm": 0.12047168612480164, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 84288983.0, "reward": 0.6316964626312256, "reward_std": 0.16886410117149353, "rewards/simpleverify_reward/mean": 0.6316964030265808, "rewards/simpleverify_reward/std": 0.4826137125492096, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3825.0, "completions/mean_length": 595.9620971679688, "completions/mean_terminated_length": 548.4502563476562, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 1.2425655976676384, "grad_norm": 0.11441270262002945, "learning_rate": 1e-06, "loss": 0.0353, "num_tokens": 84913837.0, "reward": 0.5725446939468384, "reward_std": 0.17926928400993347, "rewards/simpleverify_reward/mean": 0.5725446343421936, "rewards/simpleverify_reward/std": 0.49498558044433594, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2181.0, "completions/mean_length": 582.3616333007812, "completions/mean_terminated_length": 550.7072143554688, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 1.2518950437317784, "grad_norm": 0.12612448632717133, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 85516897.0, "reward": 0.6171875, "reward_std": 0.21797123551368713, "rewards/simpleverify_reward/mean": 0.6171875, "rewards/simpleverify_reward/std": 0.4863446056842804, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 594.7957763671875, "completions/mean_terminated_length": 535.1838989257812, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 1.2612244897959184, "grad_norm": 0.12937891483306885, "learning_rate": 1e-06, "loss": 0.0428, "num_tokens": 86133850.0, "reward": 0.6618303656578064, "reward_std": 0.1863730102777481, "rewards/simpleverify_reward/mean": 0.6618303656578064, "rewards/simpleverify_reward/std": 0.4733508229255676, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3740.0, "completions/mean_length": 608.3203125, "completions/mean_terminated_length": 540.8680419921875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 1.2705539358600584, "grad_norm": 0.12036091089248657, "learning_rate": 1e-06, "loss": 0.0467, "num_tokens": 86770505.0, "reward": 0.590401828289032, "reward_std": 0.17097207903862, "rewards/simpleverify_reward/mean": 0.5904017686843872, "rewards/simpleverify_reward/std": 0.49203425645828247, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3647.0, "completions/mean_length": 645.2444458007812, "completions/mean_terminated_length": 582.50341796875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 1.2798833819241984, "grad_norm": 0.11793738603591919, "learning_rate": 1e-06, "loss": 0.0208, "num_tokens": 87434868.0, "reward": 0.582589328289032, "reward_std": 0.1806650459766388, "rewards/simpleverify_reward/mean": 0.5825892686843872, "rewards/simpleverify_reward/std": 0.493407279253006, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 614.5982666015625, "completions/mean_terminated_length": 579.27392578125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 1.2892128279883381, "grad_norm": 0.12346206605434418, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 88077860.0, "reward": 0.5491071939468384, "reward_std": 0.19163590669631958, "rewards/simpleverify_reward/mean": 0.5491071343421936, "rewards/simpleverify_reward/std": 0.49786055088043213, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2660.0, "completions/mean_length": 633.6373291015625, "completions/mean_terminated_length": 594.5587158203125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 1.2985422740524781, "grad_norm": 0.12098722904920578, "learning_rate": 1e-06, "loss": 0.04, "num_tokens": 88741383.0, "reward": 0.5133928656578064, "reward_std": 0.22082598507404327, "rewards/simpleverify_reward/mean": 0.5133928656578064, "rewards/simpleverify_reward/std": 0.500099778175354, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3183.0, "completions/mean_length": 640.0881958007812, "completions/mean_terminated_length": 593.1753540039062, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.3078717201166181, "grad_norm": 0.11019234359264374, "learning_rate": 1e-06, "loss": 0.0258, "num_tokens": 89407958.0, "reward": 0.520089328289032, "reward_std": 0.16198793053627014, "rewards/simpleverify_reward/mean": 0.5200892686843872, "rewards/simpleverify_reward/std": 0.4998753070831299, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3021.0, "completions/mean_length": 688.1094360351562, "completions/mean_terminated_length": 610.3036499023438, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 1.3172011661807579, "grad_norm": 0.10937510430812836, "learning_rate": 1e-06, "loss": 0.0344, "num_tokens": 90118432.0, "reward": 0.512276828289032, "reward_std": 0.21289673447608948, "rewards/simpleverify_reward/mean": 0.5122767686843872, "rewards/simpleverify_reward/std": 0.500128448009491, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3366.0, "completions/mean_length": 639.0100708007812, "completions/mean_terminated_length": 599.9921264648438, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 1.3265306122448979, "grad_norm": 0.12337496131658554, "learning_rate": 1e-06, "loss": 0.0308, "num_tokens": 90784393.0, "reward": 0.5379464626312256, "reward_std": 0.19299136102199554, "rewards/simpleverify_reward/mean": 0.5379464030265808, "rewards/simpleverify_reward/std": 0.4988364577293396, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3856.0, "completions/mean_length": 661.7366333007812, "completions/mean_terminated_length": 587.3341064453125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 1.3358600583090379, "grad_norm": 0.12596161663532257, "learning_rate": 1e-06, "loss": 0.0421, "num_tokens": 91468877.0, "reward": 0.5412946939468384, "reward_std": 0.20888377726078033, "rewards/simpleverify_reward/mean": 0.5412946343421936, "rewards/simpleverify_reward/std": 0.49857014417648315, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3920.0, "completions/mean_length": 707.1406860351562, "completions/mean_terminated_length": 641.5995483398438, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.3451895043731779, "grad_norm": 0.11016078293323517, "learning_rate": 1e-06, "loss": 0.0487, "num_tokens": 92198691.0, "reward": 0.5558035969734192, "reward_std": 0.195918470621109, "rewards/simpleverify_reward/mean": 0.5558035969734192, "rewards/simpleverify_reward/std": 0.49715372920036316, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4036.0, "completions/mean_length": 616.5803833007812, "completions/mean_terminated_length": 569.3484497070312, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 1.3545189504373178, "grad_norm": 0.12824207544326782, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 92837803.0, "reward": 0.6127232313156128, "reward_std": 0.2011035531759262, "rewards/simpleverify_reward/mean": 0.6127232313156128, "rewards/simpleverify_reward/std": 0.4873998463153839, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 643.3426513671875, "completions/mean_terminated_length": 584.557373046875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 1.3638483965014578, "grad_norm": 0.12126347422599792, "learning_rate": 1e-06, "loss": 0.0251, "num_tokens": 93517622.0, "reward": 0.5558035969734192, "reward_std": 0.19779597222805023, "rewards/simpleverify_reward/mean": 0.5558035969734192, "rewards/simpleverify_reward/std": 0.49715372920036316, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0279017857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3516.0, "completions/mean_length": 708.4330444335938, "completions/mean_terminated_length": 611.200927734375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 1.3731778425655976, "grad_norm": 0.11342115700244904, "learning_rate": 1e-06, "loss": 0.031, "num_tokens": 94239986.0, "reward": 0.5412946939468384, "reward_std": 0.19114550948143005, "rewards/simpleverify_reward/mean": 0.5412946343421936, "rewards/simpleverify_reward/std": 0.49857014417648315, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3870.0, "completions/mean_length": 662.5692138671875, "completions/mean_terminated_length": 612.0203857421875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 1.3825072886297376, "grad_norm": 0.13013873994350433, "learning_rate": 1e-06, "loss": 0.0474, "num_tokens": 94917096.0, "reward": 0.5881696939468384, "reward_std": 0.23822499811649323, "rewards/simpleverify_reward/mean": 0.5881696343421936, "rewards/simpleverify_reward/std": 0.4924396276473999, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2850.0, "completions/mean_length": 604.1629638671875, "completions/mean_terminated_length": 556.762451171875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 1.3918367346938776, "grad_norm": 0.11226054280996323, "learning_rate": 1e-06, "loss": 0.0402, "num_tokens": 95543050.0, "reward": 0.59375, "reward_std": 0.17028626799583435, "rewards/simpleverify_reward/mean": 0.59375, "rewards/simpleverify_reward/std": 0.4914066195487976, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3634.0, "completions/mean_length": 587.6808471679688, "completions/mean_terminated_length": 575.894775390625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.4011661807580174, "grad_norm": 0.12148512154817581, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 96165868.0, "reward": 0.6205357313156128, "reward_std": 0.18501681089401245, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 645.1004638671875, "completions/mean_terminated_length": 586.3450927734375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 1.4104956268221573, "grad_norm": 0.12838071584701538, "learning_rate": 1e-06, "loss": 0.0486, "num_tokens": 96841078.0, "reward": 0.5502232313156128, "reward_std": 0.22060804069042206, "rewards/simpleverify_reward/mean": 0.5502232313156128, "rewards/simpleverify_reward/std": 0.49774909019470215, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 1826.0, "completions/mean_length": 603.3717041015625, "completions/mean_terminated_length": 563.9514770507812, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 1.4198250728862973, "grad_norm": 0.12589551508426666, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 97466179.0, "reward": 0.6417410969734192, "reward_std": 0.20737352967262268, "rewards/simpleverify_reward/mean": 0.6417410969734192, "rewards/simpleverify_reward/std": 0.47975656390190125, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 639.7924194335938, "completions/mean_terminated_length": 572.9487915039062, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 1.4291545189504373, "grad_norm": 0.1230233684182167, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 98128657.0, "reward": 0.5703125, "reward_std": 0.20320719480514526, "rewards/simpleverify_reward/mean": 0.5703125, "rewards/simpleverify_reward/std": 0.49530795216560364, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 572.333740234375, "completions/mean_terminated_length": 552.56005859375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 1.4384839650145773, "grad_norm": 0.11683732271194458, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 98723996.0, "reward": 0.6439732313156128, "reward_std": 0.17186996340751648, "rewards/simpleverify_reward/mean": 0.6439732313156128, "rewards/simpleverify_reward/std": 0.47909069061279297, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3391.0, "completions/mean_length": 599.794677734375, "completions/mean_terminated_length": 536.2272338867188, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 1.4478134110787173, "grad_norm": 0.13002394139766693, "learning_rate": 1e-06, "loss": 0.0296, "num_tokens": 99345740.0, "reward": 0.6183035969734192, "reward_std": 0.20459797978401184, "rewards/simpleverify_reward/mean": 0.6183035969734192, "rewards/simpleverify_reward/std": 0.4860740303993225, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3436.0, "completions/mean_length": 589.0770263671875, "completions/mean_terminated_length": 561.4634399414062, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 1.457142857142857, "grad_norm": 0.13289058208465576, "learning_rate": 1e-06, "loss": 0.0297, "num_tokens": 99962857.0, "reward": 0.6026785969734192, "reward_std": 0.19204634428024292, "rewards/simpleverify_reward/mean": 0.6026785969734192, "rewards/simpleverify_reward/std": 0.48961687088012695, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3893.0, "completions/mean_length": 667.5714721679688, "completions/mean_terminated_length": 613.1519165039062, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 1.466472303206997, "grad_norm": 0.10600113123655319, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 100652505.0, "reward": 0.5412946939468384, "reward_std": 0.1645808219909668, "rewards/simpleverify_reward/mean": 0.5412946343421936, "rewards/simpleverify_reward/std": 0.49857014417648315, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3494.0, "completions/mean_length": 646.1640625, "completions/mean_terminated_length": 595.3737182617188, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.475801749271137, "grad_norm": 0.1211216002702713, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 101322324.0, "reward": 0.5691964626312256, "reward_std": 0.18235045671463013, "rewards/simpleverify_reward/mean": 0.5691964030265808, "rewards/simpleverify_reward/std": 0.4954652786254883, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3080.0, "completions/mean_length": 652.3861694335938, "completions/mean_terminated_length": 605.6403198242188, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 1.485131195335277, "grad_norm": 0.11210227757692337, "learning_rate": 1e-06, "loss": 0.0239, "num_tokens": 101985406.0, "reward": 0.5892857313156128, "reward_std": 0.18036192655563354, "rewards/simpleverify_reward/mean": 0.5892857313156128, "rewards/simpleverify_reward/std": 0.49223825335502625, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 622.8538208007812, "completions/mean_terminated_length": 579.6847534179688, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 1.4944606413994168, "grad_norm": 0.12392577528953552, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 102633587.0, "reward": 0.6205357313156128, "reward_std": 0.21435917913913727, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3227.0, "completions/mean_length": 636.9910888671875, "completions/mean_terminated_length": 601.8939819335938, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 1.5037900874635568, "grad_norm": 0.12567861378192902, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 103286723.0, "reward": 0.6026785969734192, "reward_std": 0.21702805161476135, "rewards/simpleverify_reward/mean": 0.6026785969734192, "rewards/simpleverify_reward/std": 0.48961687088012695, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 675.4051513671875, "completions/mean_terminated_length": 609.250244140625, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 1.5131195335276968, "grad_norm": 0.1102680042386055, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 103987326.0, "reward": 0.551339328289032, "reward_std": 0.18791352212429047, "rewards/simpleverify_reward/mean": 0.5513392686843872, "rewards/simpleverify_reward/std": 0.4976350665092468, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4094.0, "completions/mean_length": 652.0692138671875, "completions/mean_terminated_length": 597.4036254882812, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.5224489795918368, "grad_norm": 0.12377143651247025, "learning_rate": 1e-06, "loss": 0.0352, "num_tokens": 104673196.0, "reward": 0.5368303656578064, "reward_std": 0.2131984382867813, "rewards/simpleverify_reward/mean": 0.5368303656578064, "rewards/simpleverify_reward/std": 0.49892017245292664, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 596.375, "completions/mean_terminated_length": 548.8687744140625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.5317784256559768, "grad_norm": 0.11980990320444107, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 105301036.0, "reward": 0.590401828289032, "reward_std": 0.16334158182144165, "rewards/simpleverify_reward/mean": 0.5904017686843872, "rewards/simpleverify_reward/std": 0.49203425645828247, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3324.0, "completions/mean_length": 660.6897583007812, "completions/mean_terminated_length": 617.990966796875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 1.5411078717201168, "grad_norm": 0.12633444368839264, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 105984902.0, "reward": 0.559151828289032, "reward_std": 0.2150859236717224, "rewards/simpleverify_reward/mean": 0.5591517686843872, "rewards/simpleverify_reward/std": 0.496766060590744, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 613.734375, "completions/mean_terminated_length": 530.1599731445312, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 1.5504373177842565, "grad_norm": 0.1370096653699875, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 106628368.0, "reward": 0.5535714626312256, "reward_std": 0.16465680301189423, "rewards/simpleverify_reward/mean": 0.5535714030265808, "rewards/simpleverify_reward/std": 0.4973995089530945, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 637.8426513671875, "completions/mean_terminated_length": 586.9297485351562, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.5597667638483965, "grad_norm": 0.12054604291915894, "learning_rate": 1e-06, "loss": 0.0282, "num_tokens": 107282779.0, "reward": 0.6049107313156128, "reward_std": 0.22228771448135376, "rewards/simpleverify_reward/mean": 0.6049107313156128, "rewards/simpleverify_reward/std": 0.48914292454719543, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3618.0, "completions/mean_length": 701.1205444335938, "completions/mean_terminated_length": 619.6434326171875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 1.5690962099125363, "grad_norm": 0.1102665513753891, "learning_rate": 1e-06, "loss": 0.0466, "num_tokens": 107995327.0, "reward": 0.5725446939468384, "reward_std": 0.18208763003349304, "rewards/simpleverify_reward/mean": 0.5725446343421936, "rewards/simpleverify_reward/std": 0.49498558044433594, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 621.7199096679688, "completions/mean_terminated_length": 586.4678344726562, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.5784256559766763, "grad_norm": 0.12137096375226974, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 108640188.0, "reward": 0.590401828289032, "reward_std": 0.19001756608486176, "rewards/simpleverify_reward/mean": 0.5904017686843872, "rewards/simpleverify_reward/std": 0.49203425645828247, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 651.5267944335938, "completions/mean_terminated_length": 620.4954833984375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 1.5877551020408163, "grad_norm": 0.12179001420736313, "learning_rate": 1e-06, "loss": 0.0406, "num_tokens": 109313124.0, "reward": 0.6127232313156128, "reward_std": 0.2114313542842865, "rewards/simpleverify_reward/mean": 0.6127232313156128, "rewards/simpleverify_reward/std": 0.4873998463153839, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3724.0, "completions/mean_length": 656.5100708007812, "completions/mean_terminated_length": 601.9149780273438, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 1.5970845481049563, "grad_norm": 0.12153714895248413, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 109992445.0, "reward": 0.5725446939468384, "reward_std": 0.19343434274196625, "rewards/simpleverify_reward/mean": 0.5725446343421936, "rewards/simpleverify_reward/std": 0.49498558044433594, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2748.0, "completions/mean_length": 657.9777221679688, "completions/mean_terminated_length": 630.9066772460938, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 1.6064139941690962, "grad_norm": 0.1343393325805664, "learning_rate": 1e-06, "loss": 0.0402, "num_tokens": 110673745.0, "reward": 0.598214328289032, "reward_std": 0.2757573425769806, "rewards/simpleverify_reward/mean": 0.5982142686843872, "rewards/simpleverify_reward/std": 0.49053290486335754, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3032.0, "completions/mean_length": 648.5111694335938, "completions/mean_terminated_length": 577.833740234375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.6157434402332362, "grad_norm": 0.12231628596782684, "learning_rate": 1e-06, "loss": 0.0309, "num_tokens": 111335259.0, "reward": 0.6238839626312256, "reward_std": 0.18960431218147278, "rewards/simpleverify_reward/mean": 0.6238839030265808, "rewards/simpleverify_reward/std": 0.48468026518821716, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3799.0, "completions/mean_length": 598.9832763671875, "completions/mean_terminated_length": 563.5005493164062, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.6250728862973762, "grad_norm": 0.1250123232603073, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 111962260.0, "reward": 0.5814732313156128, "reward_std": 0.19517098367214203, "rewards/simpleverify_reward/mean": 0.5814732313156128, "rewards/simpleverify_reward/std": 0.4935929775238037, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3499.0, "completions/mean_length": 673.9442138671875, "completions/mean_terminated_length": 611.7249755859375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 1.634402332361516, "grad_norm": 0.11655768752098083, "learning_rate": 1e-06, "loss": 0.0426, "num_tokens": 112674482.0, "reward": 0.5524553656578064, "reward_std": 0.18532173335552216, "rewards/simpleverify_reward/mean": 0.5524553656578064, "rewards/simpleverify_reward/std": 0.49751853942871094, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2965.0, "completions/mean_length": 648.646240234375, "completions/mean_terminated_length": 613.6674194335938, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 1.643731778425656, "grad_norm": 0.12407558411359787, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 113345973.0, "reward": 0.6049107313156128, "reward_std": 0.21019534766674042, "rewards/simpleverify_reward/mean": 0.6049107313156128, "rewards/simpleverify_reward/std": 0.48914292454719543, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2769.0, "completions/mean_length": 627.96875, "completions/mean_terminated_length": 596.7252197265625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 1.6530612244897958, "grad_norm": 0.12596943974494934, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 113999249.0, "reward": 0.559151828289032, "reward_std": 0.1955086886882782, "rewards/simpleverify_reward/mean": 0.5591517686843872, "rewards/simpleverify_reward/std": 0.496766060590744, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 635.3660888671875, "completions/mean_terminated_length": 572.4454345703125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 1.6623906705539357, "grad_norm": 0.12926681339740753, "learning_rate": 1e-06, "loss": 0.0323, "num_tokens": 114651993.0, "reward": 0.6227678656578064, "reward_std": 0.213277205824852, "rewards/simpleverify_reward/mean": 0.6227678656578064, "rewards/simpleverify_reward/std": 0.4849644899368286, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 581.8069458007812, "completions/mean_terminated_length": 558.11572265625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 1.6717201166180757, "grad_norm": 0.12437640875577927, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 115255476.0, "reward": 0.5848214626312256, "reward_std": 0.18840177357196808, "rewards/simpleverify_reward/mean": 0.5848214030265808, "rewards/simpleverify_reward/std": 0.49302801489830017, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3908.0, "completions/mean_length": 635.0491333007812, "completions/mean_terminated_length": 564.095703125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 1.6810495626822157, "grad_norm": 0.1275281459093094, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 115911384.0, "reward": 0.5970982313156128, "reward_std": 0.19989892840385437, "rewards/simpleverify_reward/mean": 0.5970982313156128, "rewards/simpleverify_reward/std": 0.49075525999069214, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 632.7053833007812, "completions/mean_terminated_length": 573.7389526367188, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 1.6903790087463557, "grad_norm": 0.12115955352783203, "learning_rate": 1e-06, "loss": 0.0377, "num_tokens": 116561136.0, "reward": 0.637276828289032, "reward_std": 0.18606850504875183, "rewards/simpleverify_reward/mean": 0.6372767686843872, "rewards/simpleverify_reward/std": 0.481054425239563, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3502.0, "completions/mean_length": 717.146240234375, "completions/mean_terminated_length": 667.40087890625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 1.6997084548104957, "grad_norm": 0.12867502868175507, "learning_rate": 1e-06, "loss": 0.0389, "num_tokens": 117308203.0, "reward": 0.5145089626312256, "reward_std": 0.20790556073188782, "rewards/simpleverify_reward/mean": 0.5145089030265808, "rewards/simpleverify_reward/std": 0.5000685453414917, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 552.0234375, "completions/mean_terminated_length": 520.095703125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.7090379008746357, "grad_norm": 0.1296267807483673, "learning_rate": 1e-06, "loss": 0.0223, "num_tokens": 117898592.0, "reward": 0.6439732313156128, "reward_std": 0.16559860110282898, "rewards/simpleverify_reward/mean": 0.6439732313156128, "rewards/simpleverify_reward/std": 0.47909072041511536, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 673.5859375, "completions/mean_terminated_length": 619.2619018554688, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 1.7183673469387755, "grad_norm": 0.11626194417476654, "learning_rate": 1e-06, "loss": 0.0236, "num_tokens": 118594973.0, "reward": 0.582589328289032, "reward_std": 0.1942266821861267, "rewards/simpleverify_reward/mean": 0.5825892686843872, "rewards/simpleverify_reward/std": 0.493407279253006, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 663.0535888671875, "completions/mean_terminated_length": 620.3842163085938, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 1.7276967930029155, "grad_norm": 0.1079307571053505, "learning_rate": 1e-06, "loss": 0.0394, "num_tokens": 119274853.0, "reward": 0.5613839626312256, "reward_std": 0.15135519206523895, "rewards/simpleverify_reward/mean": 0.5613839030265808, "rewards/simpleverify_reward/std": 0.496494859457016, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 629.1551513671875, "completions/mean_terminated_length": 597.9223022460938, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 1.7370262390670554, "grad_norm": 0.11852250248193741, "learning_rate": 1e-06, "loss": 0.0297, "num_tokens": 119930424.0, "reward": 0.5301339626312256, "reward_std": 0.19204454123973846, "rewards/simpleverify_reward/mean": 0.5301339030265808, "rewards/simpleverify_reward/std": 0.49936988949775696, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 572.6953125, "completions/mean_terminated_length": 540.953857421875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 1.7463556851311952, "grad_norm": 0.13314345479011536, "learning_rate": 1e-06, "loss": 0.0268, "num_tokens": 120531303.0, "reward": 0.6662946939468384, "reward_std": 0.18208763003349304, "rewards/simpleverify_reward/mean": 0.6662946343421936, "rewards/simpleverify_reward/std": 0.47179922461509705, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3375.0, "completions/mean_length": 682.4609375, "completions/mean_terminated_length": 616.4425048828125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 1.7556851311953352, "grad_norm": 0.12394275516271591, "learning_rate": 1e-06, "loss": 0.0435, "num_tokens": 121233804.0, "reward": 0.5837053656578064, "reward_std": 0.20268161594867706, "rewards/simpleverify_reward/mean": 0.5837053656578064, "rewards/simpleverify_reward/std": 0.49321892857551575, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3918.0, "completions/mean_length": 648.0089721679688, "completions/mean_terminated_length": 605.1525268554688, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.7650145772594752, "grad_norm": 0.130984365940094, "learning_rate": 1e-06, "loss": 0.0457, "num_tokens": 121911108.0, "reward": 0.6037946939468384, "reward_std": 0.21132118999958038, "rewards/simpleverify_reward/mean": 0.6037946343421936, "rewards/simpleverify_reward/std": 0.48938122391700745, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 678.5881958007812, "completions/mean_terminated_length": 636.1118774414062, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 1.7743440233236152, "grad_norm": 0.1219349130988121, "learning_rate": 1e-06, "loss": 0.0246, "num_tokens": 122603907.0, "reward": 0.606026828289032, "reward_std": 0.20031191408634186, "rewards/simpleverify_reward/mean": 0.6060267686843872, "rewards/simpleverify_reward/std": 0.48890194296836853, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3839.0, "completions/mean_length": 626.5335083007812, "completions/mean_terminated_length": 559.4334106445312, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 1.7836734693877552, "grad_norm": 0.11624182760715485, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 123255633.0, "reward": 0.629464328289032, "reward_std": 0.14741073548793793, "rewards/simpleverify_reward/mean": 0.6294642686843872, "rewards/simpleverify_reward/std": 0.4832179844379425, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3087.0, "completions/mean_length": 631.1171875, "completions/mean_terminated_length": 580.1052856445312, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 1.7930029154518952, "grad_norm": 0.1269126534461975, "learning_rate": 1e-06, "loss": 0.0397, "num_tokens": 123904762.0, "reward": 0.590401828289032, "reward_std": 0.21496829390525818, "rewards/simpleverify_reward/mean": 0.5904017686843872, "rewards/simpleverify_reward/std": 0.49203425645828247, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3659.0, "completions/mean_length": 701.4676513671875, "completions/mean_terminated_length": 627.9258422851562, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.802332361516035, "grad_norm": 0.11939291656017303, "learning_rate": 1e-06, "loss": 0.0263, "num_tokens": 124625773.0, "reward": 0.5580357313156128, "reward_std": 0.18724173307418823, "rewards/simpleverify_reward/mean": 0.5580357313156128, "rewards/simpleverify_reward/std": 0.49689781665802, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3234.0, "completions/mean_length": 628.5960083007812, "completions/mean_terminated_length": 565.55224609375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 1.811661807580175, "grad_norm": 0.1248975321650505, "learning_rate": 1e-06, "loss": 0.0313, "num_tokens": 125274451.0, "reward": 0.5926339626312256, "reward_std": 0.18490734696388245, "rewards/simpleverify_reward/mean": 0.5926339030265808, "rewards/simpleverify_reward/std": 0.49161848425865173, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 596.2890625, "completions/mean_terminated_length": 572.6954956054688, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 1.820991253644315, "grad_norm": 0.12662646174430847, "learning_rate": 1e-06, "loss": 0.02, "num_tokens": 125893110.0, "reward": 0.5803571939468384, "reward_std": 0.20662352442741394, "rewards/simpleverify_reward/mean": 0.5803571343421936, "rewards/simpleverify_reward/std": 0.4937761127948761, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3674.0, "completions/mean_length": 652.3381958007812, "completions/mean_terminated_length": 605.5916748046875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 1.8303206997084547, "grad_norm": 0.10841257125139236, "learning_rate": 1e-06, "loss": 0.0372, "num_tokens": 126560477.0, "reward": 0.6171875, "reward_std": 0.16360372304916382, "rewards/simpleverify_reward/mean": 0.6171875, "rewards/simpleverify_reward/std": 0.4863446056842804, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4046.0, "completions/mean_length": 719.6004638671875, "completions/mean_terminated_length": 658.2113647460938, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 1.8396501457725947, "grad_norm": 0.12846212089061737, "learning_rate": 1e-06, "loss": 0.0244, "num_tokens": 127306047.0, "reward": 0.4430803656578064, "reward_std": 0.2329293042421341, "rewards/simpleverify_reward/mean": 0.4430803656578064, "rewards/simpleverify_reward/std": 0.49702703952789307, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3945.0, "completions/mean_length": 632.247802734375, "completions/mean_terminated_length": 577.267578125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 1.8489795918367347, "grad_norm": 0.13074922561645508, "learning_rate": 1e-06, "loss": 0.0331, "num_tokens": 127968077.0, "reward": 0.535714328289032, "reward_std": 0.19340182840824127, "rewards/simpleverify_reward/mean": 0.5357142686843872, "rewards/simpleverify_reward/std": 0.4990014135837555, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033482142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 687.6283569335938, "completions/mean_terminated_length": 569.555419921875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 1.8583090379008746, "grad_norm": 0.11673584580421448, "learning_rate": 1e-06, "loss": 0.0378, "num_tokens": 128664760.0, "reward": 0.5625, "reward_std": 0.1899441033601761, "rewards/simpleverify_reward/mean": 0.5625, "rewards/simpleverify_reward/std": 0.49635544419288635, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2581.0, "completions/mean_length": 636.8761596679688, "completions/mean_terminated_length": 593.88134765625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 1.8676384839650146, "grad_norm": 0.12287336587905884, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 129326793.0, "reward": 0.6439732313156128, "reward_std": 0.1870569884777069, "rewards/simpleverify_reward/mean": 0.6439732313156128, "rewards/simpleverify_reward/std": 0.47909072041511536, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2448.0, "completions/mean_length": 634.966552734375, "completions/mean_terminated_length": 576.0386352539062, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 1.8769679300291546, "grad_norm": 0.13165536522865295, "learning_rate": 1e-06, "loss": 0.0493, "num_tokens": 129976339.0, "reward": 0.5970982313156128, "reward_std": 0.2148904949426651, "rewards/simpleverify_reward/mean": 0.5970982313156128, "rewards/simpleverify_reward/std": 0.4907552897930145, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3637.0, "completions/mean_length": 672.6194458007812, "completions/mean_terminated_length": 630.0689697265625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 1.8862973760932946, "grad_norm": 0.12439845502376556, "learning_rate": 1e-06, "loss": 0.0276, "num_tokens": 130659086.0, "reward": 0.5703125, "reward_std": 0.23120543360710144, "rewards/simpleverify_reward/mean": 0.5703125, "rewards/simpleverify_reward/std": 0.49530795216560364, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4067.0, "completions/mean_length": 674.083740234375, "completions/mean_terminated_length": 607.9032592773438, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 1.8956268221574344, "grad_norm": 0.13624821603298187, "learning_rate": 1e-06, "loss": 0.0523, "num_tokens": 131354857.0, "reward": 0.5580357313156128, "reward_std": 0.2348058521747589, "rewards/simpleverify_reward/mean": 0.5580357313156128, "rewards/simpleverify_reward/std": 0.49689781665802, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3153.0, "completions/mean_length": 662.396240234375, "completions/mean_terminated_length": 619.7186279296875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 1.9049562682215744, "grad_norm": 0.12502586841583252, "learning_rate": 1e-06, "loss": 0.0276, "num_tokens": 132045316.0, "reward": 0.5915178656578064, "reward_std": 0.19790726900100708, "rewards/simpleverify_reward/mean": 0.5915178656578064, "rewards/simpleverify_reward/std": 0.49182769656181335, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 640.0904541015625, "completions/mean_terminated_length": 605.0247802734375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 1.9142857142857141, "grad_norm": 0.1272956132888794, "learning_rate": 1e-06, "loss": 0.0363, "num_tokens": 132704653.0, "reward": 0.559151828289032, "reward_std": 0.20883916318416595, "rewards/simpleverify_reward/mean": 0.5591517686843872, "rewards/simpleverify_reward/std": 0.496766060590744, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 705.0123291015625, "completions/mean_terminated_length": 651.1870727539062, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 1.9236151603498541, "grad_norm": 0.11879491060972214, "learning_rate": 1e-06, "loss": 0.0434, "num_tokens": 133415968.0, "reward": 0.5636160969734192, "reward_std": 0.20883919298648834, "rewards/simpleverify_reward/mean": 0.5636160969734192, "rewards/simpleverify_reward/std": 0.49621346592903137, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3698.0, "completions/mean_length": 635.0245971679688, "completions/mean_terminated_length": 592.0067749023438, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.9329446064139941, "grad_norm": 0.13212226331233978, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 134078398.0, "reward": 0.606026828289032, "reward_std": 0.1804393082857132, "rewards/simpleverify_reward/mean": 0.6060267686843872, "rewards/simpleverify_reward/std": 0.48890194296836853, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3839.0, "completions/mean_length": 700.8192138671875, "completions/mean_terminated_length": 646.9274291992188, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.9422740524781341, "grad_norm": 0.12568719685077667, "learning_rate": 1e-06, "loss": 0.034, "num_tokens": 134797324.0, "reward": 0.5323660969734192, "reward_std": 0.21371906995773315, "rewards/simpleverify_reward/mean": 0.5323660969734192, "rewards/simpleverify_reward/std": 0.4992299973964691, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3981.0, "completions/mean_length": 574.4598388671875, "completions/mean_terminated_length": 542.7342529296875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 1.951603498542274, "grad_norm": 0.14370067417621613, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 135397264.0, "reward": 0.6618303656578064, "reward_std": 0.20910130441188812, "rewards/simpleverify_reward/mean": 0.6618303656578064, "rewards/simpleverify_reward/std": 0.4733508229255676, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3791.0, "completions/mean_length": 606.9620971679688, "completions/mean_terminated_length": 579.4893188476562, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 1.960932944606414, "grad_norm": 0.130110964179039, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 136023726.0, "reward": 0.6517857313156128, "reward_std": 0.1907753050327301, "rewards/simpleverify_reward/mean": 0.6517857313156128, "rewards/simpleverify_reward/std": 0.47667041420936584, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2091.0, "completions/mean_length": 613.4598388671875, "completions/mean_terminated_length": 597.8430786132812, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 1.970262390670554, "grad_norm": 0.13673153519630432, "learning_rate": 1e-06, "loss": 0.0222, "num_tokens": 136663306.0, "reward": 0.6160714626312256, "reward_std": 0.23431983590126038, "rewards/simpleverify_reward/mean": 0.6160714030265808, "rewards/simpleverify_reward/std": 0.486612468957901, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 585.4765625, "completions/mean_terminated_length": 537.8224487304688, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 1.9795918367346939, "grad_norm": 0.12226400524377823, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 137272429.0, "reward": 0.6026785969734192, "reward_std": 0.19005149602890015, "rewards/simpleverify_reward/mean": 0.6026785969734192, "rewards/simpleverify_reward/std": 0.48961687088012695, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2661.0, "completions/mean_length": 583.6920166015625, "completions/mean_terminated_length": 544.0496826171875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 1.9889212827988338, "grad_norm": 0.12010557949542999, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 137888169.0, "reward": 0.6227678656578064, "reward_std": 0.15755170583724976, "rewards/simpleverify_reward/mean": 0.6227678656578064, "rewards/simpleverify_reward/std": 0.4849644899368286, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005681818181818232, "completions/max_length": 4096.0, "completions/max_terminated_length": 3313.0, "completions/mean_length": 616.9346923828125, "completions/mean_terminated_length": 597.0542602539062, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 1.9982507288629736, "grad_norm": 0.12478846311569214, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 138543870.0, "reward": 0.6462053656578064, "reward_std": 0.16683532297611237, "rewards/simpleverify_reward/mean": 0.6462053656578064, "rewards/simpleverify_reward/std": 0.478413462638855, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 637.1964721679688, "completions/mean_terminated_length": 590.244384765625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 2.00932944606414, "grad_norm": 0.1259663850069046, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 139203286.0, "reward": 0.6015625, "reward_std": 0.21413344144821167, "rewards/simpleverify_reward/mean": 0.6015625, "rewards/simpleverify_reward/std": 0.48984986543655396, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 651.3460083007812, "completions/mean_terminated_length": 588.7158813476562, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 2.01865889212828, "grad_norm": 0.12478436529636383, "learning_rate": 1e-06, "loss": 0.0217, "num_tokens": 139876572.0, "reward": 0.5736607313156128, "reward_std": 0.19103604555130005, "rewards/simpleverify_reward/mean": 0.5736607313156128, "rewards/simpleverify_reward/std": 0.4948205351829529, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 642.6484375, "completions/mean_terminated_length": 575.8600463867188, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 2.02798833819242, "grad_norm": 0.13296085596084595, "learning_rate": 1e-06, "loss": 0.0436, "num_tokens": 140546081.0, "reward": 0.5602678656578064, "reward_std": 0.20095199346542358, "rewards/simpleverify_reward/mean": 0.5602678656578064, "rewards/simpleverify_reward/std": 0.4966317117214203, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3844.0, "completions/mean_length": 694.5926513671875, "completions/mean_terminated_length": 636.679931640625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 2.03731778425656, "grad_norm": 0.11701364070177078, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 141259748.0, "reward": 0.5535714626312256, "reward_std": 0.19903381168842316, "rewards/simpleverify_reward/mean": 0.5535714030265808, "rewards/simpleverify_reward/std": 0.4973994791507721, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3729.0, "completions/mean_length": 577.9408569335938, "completions/mean_terminated_length": 542.24462890625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 2.0466472303206995, "grad_norm": 0.15028582513332367, "learning_rate": 1e-06, "loss": 0.0394, "num_tokens": 141864927.0, "reward": 0.668526828289032, "reward_std": 0.20606404542922974, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 681.5982666015625, "completions/mean_terminated_length": 619.5181884765625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 2.0559766763848395, "grad_norm": 0.11933286488056183, "learning_rate": 1e-06, "loss": 0.0463, "num_tokens": 142570175.0, "reward": 0.551339328289032, "reward_std": 0.21579021215438843, "rewards/simpleverify_reward/mean": 0.5513392686843872, "rewards/simpleverify_reward/std": 0.4976350665092468, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3094.0, "completions/mean_length": 626.0324096679688, "completions/mean_terminated_length": 586.8679809570312, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.0653061224489795, "grad_norm": 0.12289385497570038, "learning_rate": 1e-06, "loss": 0.0276, "num_tokens": 143219012.0, "reward": 0.6316964626312256, "reward_std": 0.17900897562503815, "rewards/simpleverify_reward/mean": 0.6316964030265808, "rewards/simpleverify_reward/std": 0.4826137125492096, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3509.0, "completions/mean_length": 655.546875, "completions/mean_terminated_length": 604.8946533203125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 2.0746355685131195, "grad_norm": 0.11916185915470123, "learning_rate": 1e-06, "loss": 0.0345, "num_tokens": 143898030.0, "reward": 0.6171875, "reward_std": 0.1733328253030777, "rewards/simpleverify_reward/mean": 0.6171875, "rewards/simpleverify_reward/std": 0.4863446056842804, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 587.4296875, "completions/mean_terminated_length": 555.8209838867188, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 2.0839650145772595, "grad_norm": 0.14087773859500885, "learning_rate": 1e-06, "loss": 0.0357, "num_tokens": 144525487.0, "reward": 0.6551339626312256, "reward_std": 0.19125428795814514, "rewards/simpleverify_reward/mean": 0.6551339030265808, "rewards/simpleverify_reward/std": 0.4755900800228119, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2711.0, "completions/mean_length": 641.6060791015625, "completions/mean_terminated_length": 598.6700439453125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 2.0932944606413995, "grad_norm": 0.1255234330892563, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 145186070.0, "reward": 0.6484375, "reward_std": 0.1918615996837616, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3945.0, "completions/mean_length": 673.7879638671875, "completions/mean_terminated_length": 631.2520141601562, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 2.1026239067055394, "grad_norm": 0.12595200538635254, "learning_rate": 1e-06, "loss": 0.047, "num_tokens": 145888192.0, "reward": 0.5267857313156128, "reward_std": 0.21369768679141998, "rewards/simpleverify_reward/mean": 0.5267857313156128, "rewards/simpleverify_reward/std": 0.4995608627796173, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3288.0, "completions/mean_length": 705.2199096679688, "completions/mean_terminated_length": 635.7050170898438, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 2.1119533527696794, "grad_norm": 0.11300618201494217, "learning_rate": 1e-06, "loss": 0.0384, "num_tokens": 146611917.0, "reward": 0.6049107313156128, "reward_std": 0.17803187668323517, "rewards/simpleverify_reward/mean": 0.6049107313156128, "rewards/simpleverify_reward/std": 0.48914292454719543, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3912.0, "completions/mean_length": 621.6105346679688, "completions/mean_terminated_length": 578.426025390625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 2.1212827988338194, "grad_norm": 0.1328049749135971, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 147251632.0, "reward": 0.6272321939468384, "reward_std": 0.21871984004974365, "rewards/simpleverify_reward/mean": 0.6272321343421936, "rewards/simpleverify_reward/std": 0.4838111698627472, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4053.0, "completions/mean_length": 649.9285888671875, "completions/mean_terminated_length": 595.22900390625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 2.130612244897959, "grad_norm": 0.13864803314208984, "learning_rate": 1e-06, "loss": 0.0412, "num_tokens": 147922376.0, "reward": 0.6183035969734192, "reward_std": 0.19730602204799652, "rewards/simpleverify_reward/mean": 0.6183035969734192, "rewards/simpleverify_reward/std": 0.4860740303993225, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3432.0, "completions/mean_length": 709.2467041015625, "completions/mean_terminated_length": 651.5834350585938, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 2.139941690962099, "grad_norm": 0.11451639980077744, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 148652797.0, "reward": 0.5714285969734192, "reward_std": 0.179875910282135, "rewards/simpleverify_reward/mean": 0.5714285969734192, "rewards/simpleverify_reward/std": 0.49514803290367126, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 596.8683471679688, "completions/mean_terminated_length": 569.3161010742188, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 2.149271137026239, "grad_norm": 0.13307060301303864, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 149273183.0, "reward": 0.6305803656578064, "reward_std": 0.18986672163009644, "rewards/simpleverify_reward/mean": 0.6305803656578064, "rewards/simpleverify_reward/std": 0.48291724920272827, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3196.0, "completions/mean_length": 640.474365234375, "completions/mean_terminated_length": 577.6465454101562, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 2.158600583090379, "grad_norm": 0.12350767105817795, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 149936784.0, "reward": 0.582589328289032, "reward_std": 0.18960639834403992, "rewards/simpleverify_reward/mean": 0.5825892686843872, "rewards/simpleverify_reward/std": 0.4934072494506836, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0279017857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3208.0, "completions/mean_length": 706.5625610351562, "completions/mean_terminated_length": 609.2766723632812, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 2.167930029154519, "grad_norm": 0.10991515219211578, "learning_rate": 1e-06, "loss": 0.0459, "num_tokens": 150660240.0, "reward": 0.543526828289032, "reward_std": 0.1690923273563385, "rewards/simpleverify_reward/mean": 0.5435267686843872, "rewards/simpleverify_reward/std": 0.49838003516197205, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2840.0, "completions/mean_length": 670.2589721679688, "completions/mean_terminated_length": 623.7556762695312, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 2.177259475218659, "grad_norm": 0.1287374049425125, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 151352056.0, "reward": 0.582589328289032, "reward_std": 0.21103018522262573, "rewards/simpleverify_reward/mean": 0.5825892686843872, "rewards/simpleverify_reward/std": 0.493407279253006, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3923.0, "completions/mean_length": 597.6004638671875, "completions/mean_terminated_length": 566.0833740234375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 2.186588921282799, "grad_norm": 0.14673568308353424, "learning_rate": 1e-06, "loss": 0.023, "num_tokens": 151994770.0, "reward": 0.5558035969734192, "reward_std": 0.2250000685453415, "rewards/simpleverify_reward/mean": 0.5558035969734192, "rewards/simpleverify_reward/std": 0.49715372920036316, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2852.0, "completions/mean_length": 575.4207763671875, "completions/mean_terminated_length": 543.703857421875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 2.195918367346939, "grad_norm": 0.12875676155090332, "learning_rate": 1e-06, "loss": 0.0612, "num_tokens": 152597643.0, "reward": 0.6283482313156128, "reward_std": 0.19584247469902039, "rewards/simpleverify_reward/mean": 0.6283482313156128, "rewards/simpleverify_reward/std": 0.4835159182548523, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2634.0, "completions/mean_length": 607.3449096679688, "completions/mean_terminated_length": 559.9876098632812, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 2.205247813411079, "grad_norm": 0.12630048394203186, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 153231256.0, "reward": 0.6127232313156128, "reward_std": 0.1695067137479782, "rewards/simpleverify_reward/mean": 0.6127232313156128, "rewards/simpleverify_reward/std": 0.4873998463153839, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 607.0267944335938, "completions/mean_terminated_length": 563.6610107421875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 2.2145772594752184, "grad_norm": 0.14437443017959595, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 153858560.0, "reward": 0.5524553656578064, "reward_std": 0.23751750588417053, "rewards/simpleverify_reward/mean": 0.5524553656578064, "rewards/simpleverify_reward/std": 0.49751850962638855, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3620.0, "completions/mean_length": 629.3717041015625, "completions/mean_terminated_length": 586.2836303710938, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 2.2239067055393584, "grad_norm": 0.10778579115867615, "learning_rate": 1e-06, "loss": 0.0347, "num_tokens": 154504213.0, "reward": 0.5792410969734192, "reward_std": 0.15070441365242004, "rewards/simpleverify_reward/mean": 0.5792410969734192, "rewards/simpleverify_reward/std": 0.49395665526390076, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3408.0, "completions/mean_length": 602.7232666015625, "completions/mean_terminated_length": 559.303955078125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 2.2332361516034984, "grad_norm": 0.13730835914611816, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 155135445.0, "reward": 0.6305803656578064, "reward_std": 0.16578657925128937, "rewards/simpleverify_reward/mean": 0.6305803656578064, "rewards/simpleverify_reward/std": 0.4829172194004059, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3697.0, "completions/mean_length": 612.7377319335938, "completions/mean_terminated_length": 565.4536743164062, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 2.2425655976676384, "grad_norm": 0.1211320087313652, "learning_rate": 1e-06, "loss": 0.0244, "num_tokens": 155759602.0, "reward": 0.6718750596046448, "reward_std": 0.17333491146564484, "rewards/simpleverify_reward/mean": 0.671875, "rewards/simpleverify_reward/std": 0.46979284286499023, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 616.5535888671875, "completions/mean_terminated_length": 581.2491455078125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 2.2518950437317784, "grad_norm": 0.12578250467777252, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 156400514.0, "reward": 0.6127232313156128, "reward_std": 0.17081506550312042, "rewards/simpleverify_reward/mean": 0.6127232313156128, "rewards/simpleverify_reward/std": 0.4873998463153839, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3274.0, "completions/mean_length": 602.8147583007812, "completions/mean_terminated_length": 575.3093872070312, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 2.2612244897959184, "grad_norm": 0.1416006237268448, "learning_rate": 1e-06, "loss": 0.0249, "num_tokens": 157020604.0, "reward": 0.6718750596046448, "reward_std": 0.2068856805562973, "rewards/simpleverify_reward/mean": 0.671875, "rewards/simpleverify_reward/std": 0.46979284286499023, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3411.0, "completions/mean_length": 614.4230346679688, "completions/mean_terminated_length": 571.149169921875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 2.2705539358600584, "grad_norm": 0.14788463711738586, "learning_rate": 1e-06, "loss": 0.0465, "num_tokens": 157656783.0, "reward": 0.5837053656578064, "reward_std": 0.21947231888771057, "rewards/simpleverify_reward/mean": 0.5837053656578064, "rewards/simpleverify_reward/std": 0.49321892857551575, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3836.0, "completions/mean_length": 586.3426513671875, "completions/mean_terminated_length": 546.7302856445312, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 2.2798833819241984, "grad_norm": 0.13385704159736633, "learning_rate": 1e-06, "loss": 0.0313, "num_tokens": 158259210.0, "reward": 0.6584821939468384, "reward_std": 0.15394920110702515, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 639.8616333007812, "completions/mean_terminated_length": 588.9784545898438, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 2.2892128279883384, "grad_norm": 0.11635670065879822, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 158915782.0, "reward": 0.6082589626312256, "reward_std": 0.16209739446640015, "rewards/simpleverify_reward/mean": 0.6082589030265808, "rewards/simpleverify_reward/std": 0.4884119927883148, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2447.0, "completions/mean_length": 644.1439819335938, "completions/mean_terminated_length": 597.2862548828125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 2.298542274052478, "grad_norm": 0.13403303921222687, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 159585927.0, "reward": 0.5613839626312256, "reward_std": 0.22890497744083405, "rewards/simpleverify_reward/mean": 0.5613839030265808, "rewards/simpleverify_reward/std": 0.496494859457016, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 638.6428833007812, "completions/mean_terminated_length": 607.4954833984375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 2.307871720116618, "grad_norm": 0.1213318333029747, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 160253839.0, "reward": 0.5770089626312256, "reward_std": 0.1682240068912506, "rewards/simpleverify_reward/mean": 0.5770089030265808, "rewards/simpleverify_reward/std": 0.4943099319934845, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3439.0, "completions/mean_length": 610.7511596679688, "completions/mean_terminated_length": 575.3878173828125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 2.317201166180758, "grad_norm": 0.14143887162208557, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 160892072.0, "reward": 0.5926339626312256, "reward_std": 0.22300702333450317, "rewards/simpleverify_reward/mean": 0.5926339030265808, "rewards/simpleverify_reward/std": 0.49161845445632935, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3667.0, "completions/mean_length": 656.1484375, "completions/mean_terminated_length": 597.5811767578125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 2.326530612244898, "grad_norm": 0.1221829354763031, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 161570637.0, "reward": 0.6149553656578064, "reward_std": 0.17949682474136353, "rewards/simpleverify_reward/mean": 0.6149553656578064, "rewards/simpleverify_reward/std": 0.4868776500225067, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3862.0, "completions/mean_length": 639.513427734375, "completions/mean_terminated_length": 580.6629028320312, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 2.335860058309038, "grad_norm": 0.1328294426202774, "learning_rate": 1e-06, "loss": 0.0419, "num_tokens": 162226705.0, "reward": 0.6785714626312256, "reward_std": 0.19618059694766998, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3824.0, "completions/mean_length": 603.4542846679688, "completions/mean_terminated_length": 564.0349731445312, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 2.345189504373178, "grad_norm": 0.14110232889652252, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 162859240.0, "reward": 0.566964328289032, "reward_std": 0.20711320638656616, "rewards/simpleverify_reward/mean": 0.5669642686843872, "rewards/simpleverify_reward/std": 0.49577224254608154, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 619.9464721679688, "completions/mean_terminated_length": 560.7628173828125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 2.354518950437318, "grad_norm": 0.12641000747680664, "learning_rate": 1e-06, "loss": 0.0098, "num_tokens": 163500040.0, "reward": 0.6160714626312256, "reward_std": 0.1681826263666153, "rewards/simpleverify_reward/mean": 0.6160714030265808, "rewards/simpleverify_reward/std": 0.486612468957901, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3672.0, "completions/mean_length": 621.161865234375, "completions/mean_terminated_length": 585.9041137695312, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 2.363848396501458, "grad_norm": 0.11157050728797913, "learning_rate": 1e-06, "loss": 0.0217, "num_tokens": 164141737.0, "reward": 0.6629464626312256, "reward_std": 0.14350152015686035, "rewards/simpleverify_reward/mean": 0.6629464030265808, "rewards/simpleverify_reward/std": 0.47296738624572754, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3466.0, "completions/mean_length": 683.6517944335938, "completions/mean_terminated_length": 637.330322265625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 2.373177842565598, "grad_norm": 0.12643876671791077, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 164836745.0, "reward": 0.5457589626312256, "reward_std": 0.19576691091060638, "rewards/simpleverify_reward/mean": 0.5457589030265808, "rewards/simpleverify_reward/std": 0.4981797933578491, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2183.0, "completions/mean_length": 601.7467041015625, "completions/mean_terminated_length": 562.3081665039062, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 2.3825072886297374, "grad_norm": 0.12824338674545288, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 165460942.0, "reward": 0.65625, "reward_std": 0.16784630715847015, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3992.0, "completions/mean_length": 662.1417846679688, "completions/mean_terminated_length": 631.2061157226562, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 2.3918367346938774, "grad_norm": 0.11737854778766632, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 166147917.0, "reward": 0.5770089626312256, "reward_std": 0.1796155869960785, "rewards/simpleverify_reward/mean": 0.5770089030265808, "rewards/simpleverify_reward/std": 0.4943099319934845, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2667.0, "completions/mean_length": 639.7467041015625, "completions/mean_terminated_length": 604.6775512695312, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 2.4011661807580174, "grad_norm": 0.12102331221103668, "learning_rate": 1e-06, "loss": 0.0207, "num_tokens": 166809642.0, "reward": 0.59375, "reward_std": 0.17911775410175323, "rewards/simpleverify_reward/mean": 0.59375, "rewards/simpleverify_reward/std": 0.4914066195487976, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 583.171875, "completions/mean_terminated_length": 563.4590454101562, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 2.4104956268221573, "grad_norm": 0.14405018091201782, "learning_rate": 1e-06, "loss": 0.035, "num_tokens": 167417068.0, "reward": 0.6316964626312256, "reward_std": 0.21444471180438995, "rewards/simpleverify_reward/mean": 0.6316964030265808, "rewards/simpleverify_reward/std": 0.4826137125492096, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 591.0491333007812, "completions/mean_terminated_length": 571.3804931640625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 2.4198250728862973, "grad_norm": 0.13323251903057098, "learning_rate": 1e-06, "loss": 0.02, "num_tokens": 168038184.0, "reward": 0.652901828289032, "reward_std": 0.18724244832992554, "rewards/simpleverify_reward/mean": 0.6529017686843872, "rewards/simpleverify_reward/std": 0.47631320357322693, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 630.9464721679688, "completions/mean_terminated_length": 591.8374633789062, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 2.4291545189504373, "grad_norm": 0.13015353679656982, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 168695144.0, "reward": 0.5714285969734192, "reward_std": 0.21106116473674774, "rewards/simpleverify_reward/mean": 0.5714285969734192, "rewards/simpleverify_reward/std": 0.49514803290367126, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2262.0, "completions/mean_length": 626.9486694335938, "completions/mean_terminated_length": 587.7946166992188, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 2.4384839650145773, "grad_norm": 0.12193893641233444, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 169352170.0, "reward": 0.6261160969734192, "reward_std": 0.20215481519699097, "rewards/simpleverify_reward/mean": 0.6261160969734192, "rewards/simpleverify_reward/std": 0.48410359025001526, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3226.0, "completions/mean_length": 637.1361694335938, "completions/mean_terminated_length": 578.2451782226562, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 2.4478134110787173, "grad_norm": 0.12688416242599487, "learning_rate": 1e-06, "loss": 0.0263, "num_tokens": 170011884.0, "reward": 0.5948660969734192, "reward_std": 0.19693085551261902, "rewards/simpleverify_reward/mean": 0.5948660969734192, "rewards/simpleverify_reward/std": 0.49119213223457336, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 641.622802734375, "completions/mean_terminated_length": 582.8082275390625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 2.4571428571428573, "grad_norm": 0.11415253579616547, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 170671586.0, "reward": 0.6116071939468384, "reward_std": 0.1731840819120407, "rewards/simpleverify_reward/mean": 0.6116071343421936, "rewards/simpleverify_reward/std": 0.4876568913459778, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2379.0, "completions/mean_length": 621.9921875, "completions/mean_terminated_length": 582.7821655273438, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 2.466472303206997, "grad_norm": 0.13894160091876984, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 171323123.0, "reward": 0.5959821939468384, "reward_std": 0.20354530215263367, "rewards/simpleverify_reward/mean": 0.5959821343421936, "rewards/simpleverify_reward/std": 0.490975022315979, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3593.0, "completions/mean_length": 651.2545166015625, "completions/mean_terminated_length": 608.4384155273438, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 2.4758017492711373, "grad_norm": 0.1135711595416069, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 171995391.0, "reward": 0.6004464626312256, "reward_std": 0.15082666277885437, "rewards/simpleverify_reward/mean": 0.6004464030265808, "rewards/simpleverify_reward/std": 0.49008017778396606, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4084.0, "completions/mean_length": 648.5178833007812, "completions/mean_terminated_length": 569.8081665039062, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 2.485131195335277, "grad_norm": 0.11923548579216003, "learning_rate": 1e-06, "loss": 0.0554, "num_tokens": 172659807.0, "reward": 0.652901828289032, "reward_std": 0.1876543015241623, "rewards/simpleverify_reward/mean": 0.6529017686843872, "rewards/simpleverify_reward/std": 0.47631317377090454, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3666.0, "completions/mean_length": 630.755615234375, "completions/mean_terminated_length": 587.6847534179688, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 2.494460641399417, "grad_norm": 0.13553953170776367, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 173311172.0, "reward": 0.621651828289032, "reward_std": 0.19719935953617096, "rewards/simpleverify_reward/mean": 0.6216517686843872, "rewards/simpleverify_reward/std": 0.485245943069458, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3625.0, "completions/mean_length": 727.7734985351562, "completions/mean_terminated_length": 646.9359741210938, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 2.503790087463557, "grad_norm": 0.10823670029640198, "learning_rate": 1e-06, "loss": 0.0467, "num_tokens": 174062233.0, "reward": 0.5770089626312256, "reward_std": 0.17228296399116516, "rewards/simpleverify_reward/mean": 0.5770089030265808, "rewards/simpleverify_reward/std": 0.4943099319934845, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3221.0, "completions/mean_length": 634.6998291015625, "completions/mean_terminated_length": 583.7406616210938, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 2.513119533527697, "grad_norm": 0.12607285380363464, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 174715764.0, "reward": 0.637276828289032, "reward_std": 0.1714160293340683, "rewards/simpleverify_reward/mean": 0.6372767686843872, "rewards/simpleverify_reward/std": 0.481054425239563, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 601.5435791015625, "completions/mean_terminated_length": 581.933837890625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 2.522448979591837, "grad_norm": 0.11506806313991547, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 175348523.0, "reward": 0.590401828289032, "reward_std": 0.15788760781288147, "rewards/simpleverify_reward/mean": 0.5904017686843872, "rewards/simpleverify_reward/std": 0.49203425645828247, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3769.0, "completions/mean_length": 680.5022583007812, "completions/mean_terminated_length": 653.6085815429688, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 2.5317784256559768, "grad_norm": 0.1214267835021019, "learning_rate": 1e-06, "loss": 0.0306, "num_tokens": 176069253.0, "reward": 0.5691964626312256, "reward_std": 0.1878800094127655, "rewards/simpleverify_reward/mean": 0.5691964030265808, "rewards/simpleverify_reward/std": 0.4954652488231659, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 603.5826416015625, "completions/mean_terminated_length": 580.0382080078125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 2.5411078717201168, "grad_norm": 0.12562401592731476, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 176705151.0, "reward": 0.6127232313156128, "reward_std": 0.1544705480337143, "rewards/simpleverify_reward/mean": 0.6127232313156128, "rewards/simpleverify_reward/std": 0.4873998463153839, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3856.0, "completions/mean_length": 630.9564819335938, "completions/mean_terminated_length": 595.7981567382812, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 2.5504373177842563, "grad_norm": 0.13095131516456604, "learning_rate": 1e-06, "loss": 0.0491, "num_tokens": 177352624.0, "reward": 0.613839328289032, "reward_std": 0.21350222826004028, "rewards/simpleverify_reward/mean": 0.6138392686843872, "rewards/simpleverify_reward/std": 0.48714008927345276, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 750.4252319335938, "completions/mean_terminated_length": 677.944091796875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 2.5597667638483967, "grad_norm": 0.11880409717559814, "learning_rate": 1e-06, "loss": 0.0274, "num_tokens": 178109813.0, "reward": 0.5569196939468384, "reward_std": 0.20478273928165436, "rewards/simpleverify_reward/mean": 0.5569196343421936, "rewards/simpleverify_reward/std": 0.49702703952789307, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3642.0, "completions/mean_length": 658.1082763671875, "completions/mean_terminated_length": 607.4937744140625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 2.5690962099125363, "grad_norm": 0.12940101325511932, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 178786558.0, "reward": 0.5625, "reward_std": 0.20095199346542358, "rewards/simpleverify_reward/mean": 0.5625, "rewards/simpleverify_reward/std": 0.49635544419288635, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2135.0, "completions/mean_length": 649.0491333007812, "completions/mean_terminated_length": 602.2579345703125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 2.5784256559766763, "grad_norm": 0.12126835435628891, "learning_rate": 1e-06, "loss": 0.0345, "num_tokens": 179465378.0, "reward": 0.6462053656578064, "reward_std": 0.16968964040279388, "rewards/simpleverify_reward/mean": 0.6462053656578064, "rewards/simpleverify_reward/std": 0.478413462638855, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3353.0, "completions/mean_length": 633.8605346679688, "completions/mean_terminated_length": 574.9137573242188, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 2.5877551020408163, "grad_norm": 0.10884279757738113, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 180120253.0, "reward": 0.6160714626312256, "reward_std": 0.15904805064201355, "rewards/simpleverify_reward/mean": 0.6160714030265808, "rewards/simpleverify_reward/std": 0.486612468957901, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3813.0, "completions/mean_length": 663.7767944335938, "completions/mean_terminated_length": 609.2970581054688, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 2.5970845481049563, "grad_norm": 0.12922215461730957, "learning_rate": 1e-06, "loss": 0.0349, "num_tokens": 180807589.0, "reward": 0.5546875, "reward_std": 0.20429165661334991, "rewards/simpleverify_reward/mean": 0.5546875, "rewards/simpleverify_reward/std": 0.4972778558731079, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2051.0, "completions/mean_length": 639.6830444335938, "completions/mean_terminated_length": 564.802734375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 2.6064139941690962, "grad_norm": 0.12574434280395508, "learning_rate": 1e-06, "loss": 0.0326, "num_tokens": 181467769.0, "reward": 0.6484375, "reward_std": 0.18419267237186432, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3447.0, "completions/mean_length": 626.318115234375, "completions/mean_terminated_length": 587.1569213867188, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 2.6157434402332362, "grad_norm": 0.13358303904533386, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 182113334.0, "reward": 0.5970982313156128, "reward_std": 0.20245017111301422, "rewards/simpleverify_reward/mean": 0.5970982313156128, "rewards/simpleverify_reward/std": 0.4907552897930145, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 559.1942138671875, "completions/mean_terminated_length": 531.3453369140625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 2.6250728862973762, "grad_norm": 0.13495680689811707, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 182695292.0, "reward": 0.6863839626312256, "reward_std": 0.1702548861503601, "rewards/simpleverify_reward/mean": 0.6863839030265808, "rewards/simpleverify_reward/std": 0.46422144770622253, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 4081.0, "completions/mean_length": 627.6730346679688, "completions/mean_terminated_length": 584.5638427734375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 2.6344023323615158, "grad_norm": 0.13535043597221375, "learning_rate": 1e-06, "loss": 0.0398, "num_tokens": 183342351.0, "reward": 0.691964328289032, "reward_std": 0.19294539093971252, "rewards/simpleverify_reward/mean": 0.6919642686843872, "rewards/simpleverify_reward/std": 0.4619392454624176, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3701.0, "completions/mean_length": 628.6886596679688, "completions/mean_terminated_length": 569.65380859375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 2.643731778425656, "grad_norm": 0.14547421038150787, "learning_rate": 1e-06, "loss": 0.0324, "num_tokens": 183986168.0, "reward": 0.6428571939468384, "reward_std": 0.21771204471588135, "rewards/simpleverify_reward/mean": 0.6428571343421936, "rewards/simpleverify_reward/std": 0.4794250428676605, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3183.0, "completions/mean_length": 566.8828125, "completions/mean_terminated_length": 543.0910034179688, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 2.6530612244897958, "grad_norm": 0.14273953437805176, "learning_rate": 1e-06, "loss": 0.0301, "num_tokens": 184589351.0, "reward": 0.6741071939468384, "reward_std": 0.20313233137130737, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3710.0, "completions/mean_length": 628.7154541015625, "completions/mean_terminated_length": 609.2581787109375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 2.6623906705539357, "grad_norm": 0.1304483860731125, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 185242248.0, "reward": 0.598214328289032, "reward_std": 0.18419450521469116, "rewards/simpleverify_reward/mean": 0.5982142686843872, "rewards/simpleverify_reward/std": 0.49053290486335754, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3679.0, "completions/mean_length": 743.3203735351562, "completions/mean_terminated_length": 674.5866088867188, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 2.6717201166180757, "grad_norm": 0.11121264100074768, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 185997223.0, "reward": 0.4933035969734192, "reward_std": 0.19599400460720062, "rewards/simpleverify_reward/mean": 0.4933035671710968, "rewards/simpleverify_reward/std": 0.5002344250679016, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025669642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 704.3828735351562, "completions/mean_terminated_length": 615.0275268554688, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 2.6810495626822157, "grad_norm": 0.12353788316249847, "learning_rate": 1e-06, "loss": 0.039, "num_tokens": 186722966.0, "reward": 0.5502232313156128, "reward_std": 0.18719784915447235, "rewards/simpleverify_reward/mean": 0.5502232313156128, "rewards/simpleverify_reward/std": 0.49774909019470215, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3318.0, "completions/mean_length": 648.279052734375, "completions/mean_terminated_length": 625.0359497070312, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 2.6903790087463557, "grad_norm": 0.13450922071933746, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 187388672.0, "reward": 0.5948660969734192, "reward_std": 0.22037769854068756, "rewards/simpleverify_reward/mean": 0.5948660969734192, "rewards/simpleverify_reward/std": 0.49119213223457336, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 655.5558471679688, "completions/mean_terminated_length": 616.724609375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 2.6997084548104957, "grad_norm": 0.11345013231039047, "learning_rate": 1e-06, "loss": 0.0244, "num_tokens": 188060338.0, "reward": 0.59375, "reward_std": 0.1644599735736847, "rewards/simpleverify_reward/mean": 0.59375, "rewards/simpleverify_reward/std": 0.4914066195487976, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3745.0, "completions/mean_length": 625.046875, "completions/mean_terminated_length": 573.9456176757812, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 2.7090379008746357, "grad_norm": 0.13396288454532623, "learning_rate": 1e-06, "loss": 0.0278, "num_tokens": 188707828.0, "reward": 0.6183035969734192, "reward_std": 0.19456438720226288, "rewards/simpleverify_reward/mean": 0.6183035969734192, "rewards/simpleverify_reward/std": 0.4860740303993225, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 614.8069458007812, "completions/mean_terminated_length": 599.1962280273438, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 2.7183673469387752, "grad_norm": 0.12539811432361603, "learning_rate": 1e-06, "loss": 0.0292, "num_tokens": 189348991.0, "reward": 0.6004464626312256, "reward_std": 0.16537177562713623, "rewards/simpleverify_reward/mean": 0.6004464030265808, "rewards/simpleverify_reward/std": 0.49008017778396606, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3938.0, "completions/mean_length": 674.6998291015625, "completions/mean_terminated_length": 616.4483642578125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 2.7276967930029157, "grad_norm": 0.1277765929698944, "learning_rate": 1e-06, "loss": 0.037, "num_tokens": 190055890.0, "reward": 0.5546875, "reward_std": 0.19584135711193085, "rewards/simpleverify_reward/mean": 0.5546875, "rewards/simpleverify_reward/std": 0.4972778558731079, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3753.0, "completions/mean_length": 703.8705444335938, "completions/mean_terminated_length": 653.9297485351562, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 2.7370262390670552, "grad_norm": 0.12250377237796783, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 190777518.0, "reward": 0.574776828289032, "reward_std": 0.20200508832931519, "rewards/simpleverify_reward/mean": 0.5747767686843872, "rewards/simpleverify_reward/std": 0.49465295672416687, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 667.5792846679688, "completions/mean_terminated_length": 609.2066040039062, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 2.746355685131195, "grad_norm": 0.1231827586889267, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 191476501.0, "reward": 0.5680803656578064, "reward_std": 0.17359383404254913, "rewards/simpleverify_reward/mean": 0.5680803656578064, "rewards/simpleverify_reward/std": 0.4956200420856476, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3450.0, "completions/mean_length": 627.1808471679688, "completions/mean_terminated_length": 572.1201782226562, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 2.755685131195335, "grad_norm": 0.12909641861915588, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 192124375.0, "reward": 0.660714328289032, "reward_std": 0.18802198767662048, "rewards/simpleverify_reward/mean": 0.6607142686843872, "rewards/simpleverify_reward/std": 0.4737313687801361, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 637.1942138671875, "completions/mean_terminated_length": 590.2421264648438, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 2.765014577259475, "grad_norm": 0.12549960613250732, "learning_rate": 1e-06, "loss": 0.0247, "num_tokens": 192783965.0, "reward": 0.6160714626312256, "reward_std": 0.17795519530773163, "rewards/simpleverify_reward/mean": 0.6160714030265808, "rewards/simpleverify_reward/std": 0.486612468957901, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 611.0904541015625, "completions/mean_terminated_length": 575.7305297851562, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 2.774344023323615, "grad_norm": 0.12233574688434601, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 193419790.0, "reward": 0.5546875, "reward_std": 0.1658596247434616, "rewards/simpleverify_reward/mean": 0.5546875, "rewards/simpleverify_reward/std": 0.4972778558731079, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3364.0, "completions/mean_length": 633.4967041015625, "completions/mean_terminated_length": 610.1539306640625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 2.783673469387755, "grad_norm": 0.11561215668916702, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 194079675.0, "reward": 0.5736607313156128, "reward_std": 0.15424484014511108, "rewards/simpleverify_reward/mean": 0.5736607313156128, "rewards/simpleverify_reward/std": 0.4948205351829529, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 642.0, "completions/mean_terminated_length": 591.1483154296875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 2.793002915451895, "grad_norm": 0.1310715228319168, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 194742739.0, "reward": 0.5970982313156128, "reward_std": 0.19441144168376923, "rewards/simpleverify_reward/mean": 0.5970982313156128, "rewards/simpleverify_reward/std": 0.49075525999069214, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3960.0, "completions/mean_length": 698.4866333007812, "completions/mean_terminated_length": 628.833740234375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 2.8023323615160347, "grad_norm": 0.1288711428642273, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 195452543.0, "reward": 0.6350446939468384, "reward_std": 0.19084493815898895, "rewards/simpleverify_reward/mean": 0.6350446343421936, "rewards/simpleverify_reward/std": 0.4816865026950836, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3074.0, "completions/mean_length": 634.372802734375, "completions/mean_terminated_length": 563.405517578125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 2.811661807580175, "grad_norm": 0.1320972591638565, "learning_rate": 1e-06, "loss": 0.0445, "num_tokens": 196101589.0, "reward": 0.5881696939468384, "reward_std": 0.19235016405582428, "rewards/simpleverify_reward/mean": 0.5881696343421936, "rewards/simpleverify_reward/std": 0.4924395978450775, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2632.0, "completions/mean_length": 663.450927734375, "completions/mean_terminated_length": 597.0648193359375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 2.8209912536443147, "grad_norm": 0.12523125112056732, "learning_rate": 1e-06, "loss": 0.0464, "num_tokens": 196779729.0, "reward": 0.5770089626312256, "reward_std": 0.19888116419315338, "rewards/simpleverify_reward/mean": 0.5770089030265808, "rewards/simpleverify_reward/std": 0.4943099319934845, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3340.0, "completions/mean_length": 688.6239013671875, "completions/mean_terminated_length": 665.65283203125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 2.8303206997084547, "grad_norm": 0.11440297961235046, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 197486600.0, "reward": 0.5926339626312256, "reward_std": 0.18404294550418854, "rewards/simpleverify_reward/mean": 0.5926339030265808, "rewards/simpleverify_reward/std": 0.49161845445632935, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 629.771240234375, "completions/mean_terminated_length": 582.7183227539062, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 2.8396501457725947, "grad_norm": 0.1250227391719818, "learning_rate": 1e-06, "loss": 0.0418, "num_tokens": 198141899.0, "reward": 0.6662946939468384, "reward_std": 0.1778036504983902, "rewards/simpleverify_reward/mean": 0.6662946343421936, "rewards/simpleverify_reward/std": 0.47179925441741943, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 646.5234375, "completions/mean_terminated_length": 583.8056640625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 2.8489795918367347, "grad_norm": 0.11900141835212708, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 198820448.0, "reward": 0.613839328289032, "reward_std": 0.18047183752059937, "rewards/simpleverify_reward/mean": 0.6138392686843872, "rewards/simpleverify_reward/std": 0.48714008927345276, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3417.0, "completions/mean_length": 625.8951416015625, "completions/mean_terminated_length": 574.8063354492188, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 2.8583090379008746, "grad_norm": 0.13671791553497314, "learning_rate": 1e-06, "loss": 0.0418, "num_tokens": 199464386.0, "reward": 0.606026828289032, "reward_std": 0.20406411588191986, "rewards/simpleverify_reward/mean": 0.6060267686843872, "rewards/simpleverify_reward/std": 0.48890194296836853, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3313.0, "completions/mean_length": 650.765625, "completions/mean_terminated_length": 592.1067504882812, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 2.8676384839650146, "grad_norm": 0.132360577583313, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 200131032.0, "reward": 0.6071428656578064, "reward_std": 0.1908085197210312, "rewards/simpleverify_reward/mean": 0.6071428656578064, "rewards/simpleverify_reward/std": 0.48865827918052673, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3879.0, "completions/mean_length": 606.099365234375, "completions/mean_terminated_length": 566.7099609375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 2.8769679300291546, "grad_norm": 0.1351647526025772, "learning_rate": 1e-06, "loss": 0.0296, "num_tokens": 200760905.0, "reward": 0.6417410969734192, "reward_std": 0.15868036448955536, "rewards/simpleverify_reward/mean": 0.6417410969734192, "rewards/simpleverify_reward/std": 0.47975659370422363, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2747.0, "completions/mean_length": 586.5926513671875, "completions/mean_terminated_length": 570.8554077148438, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 2.8862973760932946, "grad_norm": 0.14149627089500427, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 201373628.0, "reward": 0.574776828289032, "reward_std": 0.17303113639354706, "rewards/simpleverify_reward/mean": 0.5747767686843872, "rewards/simpleverify_reward/std": 0.49465295672416687, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2103.0, "completions/mean_length": 662.625, "completions/mean_terminated_length": 623.8735961914062, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 2.8956268221574346, "grad_norm": 0.13529404997825623, "learning_rate": 1e-06, "loss": 0.0202, "num_tokens": 202052676.0, "reward": 0.5837053656578064, "reward_std": 0.22416704893112183, "rewards/simpleverify_reward/mean": 0.5837053656578064, "rewards/simpleverify_reward/std": 0.49321892857551575, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 623.9710083007812, "completions/mean_terminated_length": 552.7904663085938, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 2.904956268221574, "grad_norm": 0.1255316138267517, "learning_rate": 1e-06, "loss": 0.0292, "num_tokens": 202710938.0, "reward": 0.6729910969734192, "reward_std": 0.1785222589969635, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.46938255429267883, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 630.1015625, "completions/mean_terminated_length": 590.9830932617188, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 2.914285714285714, "grad_norm": 0.11100063472986221, "learning_rate": 1e-06, "loss": 0.0343, "num_tokens": 203365853.0, "reward": 0.6473214626312256, "reward_std": 0.139706090092659, "rewards/simpleverify_reward/mean": 0.6473214030265808, "rewards/simpleverify_reward/std": 0.47807058691978455, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 708.1261596679688, "completions/mean_terminated_length": 642.6040649414062, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 2.923615160349854, "grad_norm": 0.10849550366401672, "learning_rate": 1e-06, "loss": 0.0285, "num_tokens": 204093990.0, "reward": 0.5792410969734192, "reward_std": 0.1547301560640335, "rewards/simpleverify_reward/mean": 0.5792410969734192, "rewards/simpleverify_reward/std": 0.49395665526390076, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3672.0, "completions/mean_length": 678.0435791015625, "completions/mean_terminated_length": 639.4661254882812, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 2.932944606413994, "grad_norm": 0.12918327748775482, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 204790989.0, "reward": 0.546875, "reward_std": 0.18945305049419403, "rewards/simpleverify_reward/mean": 0.546875, "rewards/simpleverify_reward/std": 0.4980759024620056, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 630.125, "completions/mean_terminated_length": 594.958251953125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 2.942274052478134, "grad_norm": 0.1300196349620819, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 205450293.0, "reward": 0.5915178656578064, "reward_std": 0.18144895136356354, "rewards/simpleverify_reward/mean": 0.5915178656578064, "rewards/simpleverify_reward/std": 0.49182769656181335, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3784.0, "completions/mean_length": 598.3404541015625, "completions/mean_terminated_length": 578.7127075195312, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 2.951603498542274, "grad_norm": 0.1445261687040329, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 206081478.0, "reward": 0.640625, "reward_std": 0.1939670741558075, "rewards/simpleverify_reward/mean": 0.640625, "rewards/simpleverify_reward/std": 0.48008525371551514, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3797.0, "completions/mean_length": 700.7377319335938, "completions/mean_terminated_length": 654.648193359375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 2.960932944606414, "grad_norm": 0.12052702158689499, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 206796651.0, "reward": 0.5691964626312256, "reward_std": 0.1950158178806305, "rewards/simpleverify_reward/mean": 0.5691964030265808, "rewards/simpleverify_reward/std": 0.4954652488231659, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2277.0, "completions/mean_length": 596.8046875, "completions/mean_terminated_length": 553.3118896484375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 2.970262390670554, "grad_norm": 0.1431368589401245, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 207420540.0, "reward": 0.6104910969734192, "reward_std": 0.18321697413921356, "rewards/simpleverify_reward/mean": 0.6104910969734192, "rewards/simpleverify_reward/std": 0.48791125416755676, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.024553571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 685.5647583007812, "completions/mean_terminated_length": 599.718505859375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 2.979591836734694, "grad_norm": 0.1341426521539688, "learning_rate": 1e-06, "loss": 0.035, "num_tokens": 208124590.0, "reward": 0.609375, "reward_std": 0.17104442417621613, "rewards/simpleverify_reward/mean": 0.609375, "rewards/simpleverify_reward/std": 0.48816296458244324, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2635.0, "completions/mean_length": 639.5100708007812, "completions/mean_terminated_length": 604.4385375976562, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 2.9889212827988336, "grad_norm": 0.13848796486854553, "learning_rate": 1e-06, "loss": 0.0319, "num_tokens": 208794295.0, "reward": 0.6350446939468384, "reward_std": 0.21260391175746918, "rewards/simpleverify_reward/mean": 0.6350446343421936, "rewards/simpleverify_reward/std": 0.4816865026950836, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0028409090909090606, "completions/max_length": 4096.0, "completions/max_terminated_length": 2790.0, "completions/mean_length": 627.7755737304688, "completions/mean_terminated_length": 617.8945922851562, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 2.9982507288629736, "grad_norm": 0.1302075833082199, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 209443566.0, "reward": 0.6439732313156128, "reward_std": 0.1923808455467224, "rewards/simpleverify_reward/mean": 0.6439732313156128, "rewards/simpleverify_reward/std": 0.47909072041511536, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3943.0, "completions/mean_length": 764.786865234375, "completions/mean_terminated_length": 696.4932250976562, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 3.00932944606414, "grad_norm": 0.12445256859064102, "learning_rate": 1e-06, "loss": 0.0378, "num_tokens": 210219103.0, "reward": 0.5167410969734192, "reward_std": 0.2021559327840805, "rewards/simpleverify_reward/mean": 0.5167410969734192, "rewards/simpleverify_reward/std": 0.4999987483024597, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3089.0, "completions/mean_length": 685.0647583007812, "completions/mean_terminated_length": 630.9229125976562, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 3.01865889212828, "grad_norm": 0.12259411066770554, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 210915417.0, "reward": 0.6328125, "reward_std": 0.1760030835866928, "rewards/simpleverify_reward/mean": 0.6328125, "rewards/simpleverify_reward/std": 0.48230743408203125, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3106.0, "completions/mean_length": 642.364990234375, "completions/mean_terminated_length": 611.2511596679688, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 3.02798833819242, "grad_norm": 0.11737199872732162, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 211580640.0, "reward": 0.5870535969734192, "reward_std": 0.17246659100055695, "rewards/simpleverify_reward/mean": 0.5870535969734192, "rewards/simpleverify_reward/std": 0.49263837933540344, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3808.0, "completions/mean_length": 654.3392944335938, "completions/mean_terminated_length": 615.494384765625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 3.03731778425656, "grad_norm": 0.12266357243061066, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 212257520.0, "reward": 0.6082589626312256, "reward_std": 0.17690351605415344, "rewards/simpleverify_reward/mean": 0.6082589030265808, "rewards/simpleverify_reward/std": 0.4884119927883148, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2771.0, "completions/mean_length": 597.2879638671875, "completions/mean_terminated_length": 577.6543579101562, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 3.0466472303206995, "grad_norm": 0.1263723373413086, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 212872954.0, "reward": 0.6484375, "reward_std": 0.14496755599975586, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 664.0357666015625, "completions/mean_terminated_length": 633.1171264648438, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 3.0559766763848395, "grad_norm": 0.12582091987133026, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 213560082.0, "reward": 0.5725446939468384, "reward_std": 0.17911705374717712, "rewards/simpleverify_reward/mean": 0.5725446343421936, "rewards/simpleverify_reward/std": 0.49498558044433594, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 655.1105346679688, "completions/mean_terminated_length": 600.4932250976562, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 3.0653061224489795, "grad_norm": 0.12499570846557617, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 214239813.0, "reward": 0.6205357313156128, "reward_std": 0.1673256903886795, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3192.0, "completions/mean_length": 636.71875, "completions/mean_terminated_length": 569.815673828125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 3.0746355685131195, "grad_norm": 0.11107704788446426, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 214901121.0, "reward": 0.6383928656578064, "reward_std": 0.14263570308685303, "rewards/simpleverify_reward/mean": 0.6383928656578064, "rewards/simpleverify_reward/std": 0.4807341694831848, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2100.0, "completions/mean_length": 678.1596069335938, "completions/mean_terminated_length": 627.8402709960938, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 3.0839650145772595, "grad_norm": 0.1328374743461609, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 215604488.0, "reward": 0.535714328289032, "reward_std": 0.177999809384346, "rewards/simpleverify_reward/mean": 0.5357142686843872, "rewards/simpleverify_reward/std": 0.4990014135837555, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3884.0, "completions/mean_length": 646.7221069335938, "completions/mean_terminated_length": 591.9716796875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 3.0932944606413995, "grad_norm": 0.14378158748149872, "learning_rate": 1e-06, "loss": 0.0425, "num_tokens": 216270727.0, "reward": 0.6517857313156128, "reward_std": 0.21027062833309174, "rewards/simpleverify_reward/mean": 0.6517857313156128, "rewards/simpleverify_reward/std": 0.47667041420936584, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 606.450927734375, "completions/mean_terminated_length": 594.7279052734375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 3.1026239067055394, "grad_norm": 0.10776609927415848, "learning_rate": 1e-06, "loss": -0.005, "num_tokens": 216901323.0, "reward": 0.65625, "reward_std": 0.1277197003364563, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 563.0022583007812, "completions/mean_terminated_length": 543.1762084960938, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 3.1119533527696794, "grad_norm": 0.13021297752857208, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 217501101.0, "reward": 0.6629464626312256, "reward_std": 0.1515413373708725, "rewards/simpleverify_reward/mean": 0.6629464030265808, "rewards/simpleverify_reward/std": 0.47296738624572754, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2970.0, "completions/mean_length": 679.1027221679688, "completions/mean_terminated_length": 616.9772338867188, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 3.1212827988338194, "grad_norm": 0.11123310029506683, "learning_rate": 1e-06, "loss": 0.0243, "num_tokens": 218210409.0, "reward": 0.6037946939468384, "reward_std": 0.13996751606464386, "rewards/simpleverify_reward/mean": 0.6037946343421936, "rewards/simpleverify_reward/std": 0.48938122391700745, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2827.0, "completions/mean_length": 617.208740234375, "completions/mean_terminated_length": 557.9784545898438, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 3.130612244897959, "grad_norm": 0.14170345664024353, "learning_rate": 1e-06, "loss": 0.0379, "num_tokens": 218844972.0, "reward": 0.6953125596046448, "reward_std": 0.18483206629753113, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3969.0, "completions/mean_length": 637.2199096679688, "completions/mean_terminated_length": 613.9022827148438, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 3.139941690962099, "grad_norm": 0.13810773193836212, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 219506257.0, "reward": 0.5770089626312256, "reward_std": 0.2022646963596344, "rewards/simpleverify_reward/mean": 0.5770089030265808, "rewards/simpleverify_reward/std": 0.4943099319934845, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3393.0, "completions/mean_length": 611.2701416015625, "completions/mean_terminated_length": 571.9390869140625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 3.149271137026239, "grad_norm": 0.13029102981090546, "learning_rate": 1e-06, "loss": 0.0268, "num_tokens": 220141259.0, "reward": 0.6796875596046448, "reward_std": 0.15898345410823822, "rewards/simpleverify_reward/mean": 0.6796875, "rewards/simpleverify_reward/std": 0.4668572247028351, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4012.0, "completions/mean_length": 777.4453735351562, "completions/mean_terminated_length": 697.7999877929688, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 3.158600583090379, "grad_norm": 0.11765523999929428, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 220952434.0, "reward": 0.4877232313156128, "reward_std": 0.19718796014785767, "rewards/simpleverify_reward/mean": 0.4877232015132904, "rewards/simpleverify_reward/std": 0.500128448009491, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3955.0, "completions/mean_length": 673.0670166015625, "completions/mean_terminated_length": 630.5220336914062, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 3.167930029154519, "grad_norm": 0.11673524230718613, "learning_rate": 1e-06, "loss": 0.0363, "num_tokens": 221648030.0, "reward": 0.6149553656578064, "reward_std": 0.15244358777999878, "rewards/simpleverify_reward/mean": 0.6149553656578064, "rewards/simpleverify_reward/std": 0.4868776500225067, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3933.0, "completions/mean_length": 656.9933471679688, "completions/mean_terminated_length": 614.2485961914062, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 3.177259475218659, "grad_norm": 0.12957152724266052, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 222321592.0, "reward": 0.6439732313156128, "reward_std": 0.18032167851924896, "rewards/simpleverify_reward/mean": 0.6439732313156128, "rewards/simpleverify_reward/std": 0.47909072041511536, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3094.0, "completions/mean_length": 651.1428833007812, "completions/mean_terminated_length": 604.380126953125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 3.186588921282799, "grad_norm": 0.13614900410175323, "learning_rate": 1e-06, "loss": 0.049, "num_tokens": 222987688.0, "reward": 0.6194196939468384, "reward_std": 0.1756679117679596, "rewards/simpleverify_reward/mean": 0.6194196343421936, "rewards/simpleverify_reward/std": 0.48580074310302734, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3915.0, "completions/mean_length": 721.2645263671875, "completions/mean_terminated_length": 667.697265625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 3.195918367346939, "grad_norm": 0.12385595589876175, "learning_rate": 1e-06, "loss": 0.0374, "num_tokens": 223727413.0, "reward": 0.5915178656578064, "reward_std": 0.18517018854618073, "rewards/simpleverify_reward/mean": 0.5915178656578064, "rewards/simpleverify_reward/std": 0.49182769656181335, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3240.0, "completions/mean_length": 647.9420166015625, "completions/mean_terminated_length": 605.0847778320312, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 3.205247813411079, "grad_norm": 0.13304628431797028, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 224394073.0, "reward": 0.6484375, "reward_std": 0.16596952080726624, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3894.0, "completions/mean_length": 663.6819458007812, "completions/mean_terminated_length": 640.542724609375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 3.2145772594752184, "grad_norm": 0.12623722851276398, "learning_rate": 1e-06, "loss": 0.0159, "num_tokens": 225077292.0, "reward": 0.566964328289032, "reward_std": 0.16645735502243042, "rewards/simpleverify_reward/mean": 0.5669642686843872, "rewards/simpleverify_reward/std": 0.49577224254608154, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3777.0, "completions/mean_length": 715.2433471679688, "completions/mean_terminated_length": 677.0858154296875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 3.2239067055393584, "grad_norm": 0.13440150022506714, "learning_rate": 1e-06, "loss": 0.0207, "num_tokens": 225812934.0, "reward": 0.5792410969734192, "reward_std": 0.18554814159870148, "rewards/simpleverify_reward/mean": 0.5792410969734192, "rewards/simpleverify_reward/std": 0.49395665526390076, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3113.0, "completions/mean_length": 637.3125, "completions/mean_terminated_length": 598.275390625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 3.2332361516034984, "grad_norm": 0.12121071666479111, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 226467918.0, "reward": 0.6830357313156128, "reward_std": 0.13560616970062256, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3882.0, "completions/mean_length": 648.03125, "completions/mean_terminated_length": 609.1151123046875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 3.2425655976676384, "grad_norm": 0.13155969977378845, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 227139786.0, "reward": 0.6037946939468384, "reward_std": 0.18077561259269714, "rewards/simpleverify_reward/mean": 0.6037946343421936, "rewards/simpleverify_reward/std": 0.48938122391700745, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 606.2545166015625, "completions/mean_terminated_length": 570.8455200195312, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 3.2518950437317784, "grad_norm": 0.13445456326007843, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 227774190.0, "reward": 0.6339285969734192, "reward_std": 0.1862214356660843, "rewards/simpleverify_reward/mean": 0.6339285969734192, "rewards/simpleverify_reward/std": 0.48199835419654846, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0267857142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3789.0, "completions/mean_length": 813.2344360351562, "completions/mean_terminated_length": 722.8829956054688, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 3.2612244897959184, "grad_norm": 0.12464670091867447, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 228597696.0, "reward": 0.5089285969734192, "reward_std": 0.2061375081539154, "rewards/simpleverify_reward/mean": 0.5089285969734192, "rewards/simpleverify_reward/std": 0.5001994967460632, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3082.0, "completions/mean_length": 678.755615234375, "completions/mean_terminated_length": 620.5732421875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 3.2705539358600584, "grad_norm": 0.12620891630649567, "learning_rate": 1e-06, "loss": 0.0307, "num_tokens": 229298325.0, "reward": 0.5837053656578064, "reward_std": 0.1564333289861679, "rewards/simpleverify_reward/mean": 0.5837053656578064, "rewards/simpleverify_reward/std": 0.49321892857551575, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3780.0, "completions/mean_length": 656.5167846679688, "completions/mean_terminated_length": 621.6177978515625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 3.2798833819241984, "grad_norm": 0.13385450839996338, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 229973220.0, "reward": 0.582589328289032, "reward_std": 0.1706339716911316, "rewards/simpleverify_reward/mean": 0.5825892686843872, "rewards/simpleverify_reward/std": 0.4934072494506836, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3339.0, "completions/mean_length": 740.5792846679688, "completions/mean_terminated_length": 683.4495239257812, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 3.2892128279883384, "grad_norm": 0.12300484627485275, "learning_rate": 1e-06, "loss": 0.0558, "num_tokens": 230731427.0, "reward": 0.598214328289032, "reward_std": 0.20083294808864594, "rewards/simpleverify_reward/mean": 0.5982142686843872, "rewards/simpleverify_reward/std": 0.49053287506103516, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 679.958740234375, "completions/mean_terminated_length": 605.950927734375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 3.298542274052478, "grad_norm": 0.1337970346212387, "learning_rate": 1e-06, "loss": 0.0365, "num_tokens": 231434190.0, "reward": 0.65625, "reward_std": 0.20316554605960846, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 686.622802734375, "completions/mean_terminated_length": 644.246337890625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 3.307871720116618, "grad_norm": 0.1508442461490631, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 232140324.0, "reward": 0.566964328289032, "reward_std": 0.18918202817440033, "rewards/simpleverify_reward/mean": 0.5669642686843872, "rewards/simpleverify_reward/std": 0.49577224254608154, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3643.0, "completions/mean_length": 571.9542846679688, "completions/mean_terminated_length": 548.1966552734375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 3.317201166180758, "grad_norm": 0.12467329204082489, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 232730163.0, "reward": 0.6953125596046448, "reward_std": 0.13876289129257202, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3683.0, "completions/mean_length": 646.966552734375, "completions/mean_terminated_length": 623.714599609375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 3.326530612244898, "grad_norm": 0.15083983540534973, "learning_rate": 1e-06, "loss": 0.0232, "num_tokens": 233400605.0, "reward": 0.6540178656578064, "reward_std": 0.22244246304035187, "rewards/simpleverify_reward/mean": 0.6540178656578064, "rewards/simpleverify_reward/std": 0.4759531021118164, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2553.0, "completions/mean_length": 728.4799194335938, "completions/mean_terminated_length": 675.0272216796875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 3.335860058309038, "grad_norm": 0.1235504075884819, "learning_rate": 1e-06, "loss": 0.03, "num_tokens": 234150427.0, "reward": 0.566964328289032, "reward_std": 0.18265214562416077, "rewards/simpleverify_reward/mean": 0.5669642686843872, "rewards/simpleverify_reward/std": 0.49577224254608154, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3700.0, "completions/mean_length": 627.6886596679688, "completions/mean_terminated_length": 608.2256469726562, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 3.345189504373178, "grad_norm": 0.13009603321552277, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 234796604.0, "reward": 0.6473214626312256, "reward_std": 0.16720552742481232, "rewards/simpleverify_reward/mean": 0.6473214030265808, "rewards/simpleverify_reward/std": 0.47807058691978455, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3623.0, "completions/mean_length": 706.1484985351562, "completions/mean_terminated_length": 667.8883056640625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 3.354518950437318, "grad_norm": 0.12519454956054688, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 235524033.0, "reward": 0.6049107313156128, "reward_std": 0.18329043686389923, "rewards/simpleverify_reward/mean": 0.6049107313156128, "rewards/simpleverify_reward/std": 0.48914292454719543, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3308.0, "completions/mean_length": 660.2678833007812, "completions/mean_terminated_length": 609.6851196289062, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 3.363848396501458, "grad_norm": 0.13262206315994263, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 236206545.0, "reward": 0.6272321939468384, "reward_std": 0.16608896851539612, "rewards/simpleverify_reward/mean": 0.6272321343421936, "rewards/simpleverify_reward/std": 0.4838111698627472, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2810.0, "completions/mean_length": 625.935302734375, "completions/mean_terminated_length": 594.6734619140625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 3.373177842565598, "grad_norm": 0.141575887799263, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 236868103.0, "reward": 0.6551339626312256, "reward_std": 0.19065697491168976, "rewards/simpleverify_reward/mean": 0.6551339030265808, "rewards/simpleverify_reward/std": 0.4755900800228119, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3783.0, "completions/mean_length": 595.4207763671875, "completions/mean_terminated_length": 563.884033203125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 3.3825072886297374, "grad_norm": 0.1579931676387787, "learning_rate": 1e-06, "loss": 0.0376, "num_tokens": 237487752.0, "reward": 0.6651785969734192, "reward_std": 0.18359464406967163, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219157218933105, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3488.0, "completions/mean_length": 679.9241333007812, "completions/mean_terminated_length": 637.4644165039062, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 3.3918367346938774, "grad_norm": 0.14218293130397797, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 238187060.0, "reward": 0.5558035969734192, "reward_std": 0.21485912799835205, "rewards/simpleverify_reward/mean": 0.5558035969734192, "rewards/simpleverify_reward/std": 0.49715372920036316, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3708.0, "completions/mean_length": 667.3248291015625, "completions/mean_terminated_length": 612.9013671875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 3.4011661807580174, "grad_norm": 0.11303266882896423, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 238882831.0, "reward": 0.5837053656578064, "reward_std": 0.1464318335056305, "rewards/simpleverify_reward/mean": 0.5837053656578064, "rewards/simpleverify_reward/std": 0.49321895837783813, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 669.325927734375, "completions/mean_terminated_length": 626.7344970703125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 3.4104956268221573, "grad_norm": 0.13336755335330963, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 239567507.0, "reward": 0.6383928656578064, "reward_std": 0.17333604395389557, "rewards/simpleverify_reward/mean": 0.6383928656578064, "rewards/simpleverify_reward/std": 0.4807341992855072, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2345.0, "completions/mean_length": 685.521240234375, "completions/mean_terminated_length": 654.7962036132812, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 3.4198250728862973, "grad_norm": 0.11802571266889572, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 240283734.0, "reward": 0.566964328289032, "reward_std": 0.16604548692703247, "rewards/simpleverify_reward/mean": 0.5669642686843872, "rewards/simpleverify_reward/std": 0.49577224254608154, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3715.0, "completions/mean_length": 692.099365234375, "completions/mean_terminated_length": 649.7909545898438, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 3.4291545189504373, "grad_norm": 0.14002977311611176, "learning_rate": 1e-06, "loss": 0.0282, "num_tokens": 241002063.0, "reward": 0.578125, "reward_std": 0.18088480830192566, "rewards/simpleverify_reward/mean": 0.578125, "rewards/simpleverify_reward/std": 0.4941346049308777, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2933.0, "completions/mean_length": 585.5078125, "completions/mean_terminated_length": 569.7657470703125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 3.4384839650145773, "grad_norm": 0.14127209782600403, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 241610758.0, "reward": 0.707589328289032, "reward_std": 0.1575837880373001, "rewards/simpleverify_reward/mean": 0.7075892686843872, "rewards/simpleverify_reward/std": 0.45512402057647705, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3199.0, "completions/mean_length": 648.849365234375, "completions/mean_terminated_length": 594.1326904296875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 3.4478134110787173, "grad_norm": 0.126497283577919, "learning_rate": 1e-06, "loss": 0.0348, "num_tokens": 242292479.0, "reward": 0.6037946939468384, "reward_std": 0.17171771824359894, "rewards/simpleverify_reward/mean": 0.6037946343421936, "rewards/simpleverify_reward/std": 0.48938122391700745, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3457.0, "completions/mean_length": 698.3426513671875, "completions/mean_terminated_length": 656.1118774414062, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 3.4571428571428573, "grad_norm": 0.12549157440662384, "learning_rate": 1e-06, "loss": 0.0271, "num_tokens": 243004058.0, "reward": 0.6037946939468384, "reward_std": 0.15582603216171265, "rewards/simpleverify_reward/mean": 0.6037946343421936, "rewards/simpleverify_reward/std": 0.48938122391700745, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 590.2589721679688, "completions/mean_terminated_length": 566.6246948242188, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 3.466472303206997, "grad_norm": 0.14915801584720612, "learning_rate": 1e-06, "loss": 0.0252, "num_tokens": 243618138.0, "reward": 0.668526828289032, "reward_std": 0.18945486843585968, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 659.7355346679688, "completions/mean_terminated_length": 609.1449584960938, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 3.4758017492711373, "grad_norm": 0.14632278680801392, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 244305253.0, "reward": 0.621651828289032, "reward_std": 0.20542213320732117, "rewards/simpleverify_reward/mean": 0.6216517686843872, "rewards/simpleverify_reward/std": 0.485245943069458, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3838.0, "completions/mean_length": 654.0424194335938, "completions/mean_terminated_length": 626.9404296875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 3.485131195335277, "grad_norm": 0.13217292726039886, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 244980643.0, "reward": 0.6227678656578064, "reward_std": 0.17217236757278442, "rewards/simpleverify_reward/mean": 0.6227678656578064, "rewards/simpleverify_reward/std": 0.4849644899368286, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4050.0, "completions/mean_length": 665.2098388671875, "completions/mean_terminated_length": 638.1957397460938, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 3.494460641399417, "grad_norm": 0.12632673978805542, "learning_rate": 1e-06, "loss": 0.0329, "num_tokens": 245674255.0, "reward": 0.6328125, "reward_std": 0.16273540258407593, "rewards/simpleverify_reward/mean": 0.6328125, "rewards/simpleverify_reward/std": 0.48230743408203125, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 643.2935791015625, "completions/mean_terminated_length": 627.810546875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 3.503790087463557, "grad_norm": 0.12387573719024658, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 246328622.0, "reward": 0.5870535969734192, "reward_std": 0.16029615700244904, "rewards/simpleverify_reward/mean": 0.5870535969734192, "rewards/simpleverify_reward/std": 0.49263837933540344, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 709.2779541015625, "completions/mean_terminated_length": 671.0530395507812, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 3.513119533527697, "grad_norm": 0.13564130663871765, "learning_rate": 1e-06, "loss": 0.038, "num_tokens": 247052551.0, "reward": 0.637276828289032, "reward_std": 0.19043055176734924, "rewards/simpleverify_reward/mean": 0.6372767686843872, "rewards/simpleverify_reward/std": 0.481054425239563, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3860.0, "completions/mean_length": 696.0234985351562, "completions/mean_terminated_length": 661.5253295898438, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 3.522448979591837, "grad_norm": 0.12313434481620789, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 247772484.0, "reward": 0.6238839626312256, "reward_std": 0.15210476517677307, "rewards/simpleverify_reward/mean": 0.6238839030265808, "rewards/simpleverify_reward/std": 0.4846802353858948, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3595.0, "completions/mean_length": 700.2221069335938, "completions/mean_terminated_length": 618.7234497070312, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 3.5317784256559768, "grad_norm": 0.14206165075302124, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 248494739.0, "reward": 0.582589328289032, "reward_std": 0.21289604902267456, "rewards/simpleverify_reward/mean": 0.5825892686843872, "rewards/simpleverify_reward/std": 0.493407279253006, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 655.568115234375, "completions/mean_terminated_length": 620.6594848632812, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 3.5411078717201168, "grad_norm": 0.13462576270103455, "learning_rate": 1e-06, "loss": 0.0204, "num_tokens": 249166768.0, "reward": 0.6506696939468384, "reward_std": 0.18870557844638824, "rewards/simpleverify_reward/mean": 0.6506696343421936, "rewards/simpleverify_reward/std": 0.47702476382255554, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3615.0, "completions/mean_length": 655.4464721679688, "completions/mean_terminated_length": 616.614013671875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 3.5504373177842563, "grad_norm": 0.1483190953731537, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 249854128.0, "reward": 0.6127232313156128, "reward_std": 0.16477374732494354, "rewards/simpleverify_reward/mean": 0.6127232313156128, "rewards/simpleverify_reward/std": 0.4873998463153839, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3109.0, "completions/mean_length": 613.359375, "completions/mean_terminated_length": 574.0519409179688, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 3.5597667638483967, "grad_norm": 0.14978662133216858, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 250486930.0, "reward": 0.6830357313156128, "reward_std": 0.18261894583702087, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2556.0, "completions/mean_length": 626.5658569335938, "completions/mean_terminated_length": 599.2474975585938, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 3.5690962099125363, "grad_norm": 0.12795282900333405, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 251139165.0, "reward": 0.6049107313156128, "reward_std": 0.16055577993392944, "rewards/simpleverify_reward/mean": 0.6049107313156128, "rewards/simpleverify_reward/std": 0.48914292454719543, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2730.0, "completions/mean_length": 624.9676513671875, "completions/mean_terminated_length": 589.74853515625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 3.5784256559766763, "grad_norm": 0.1344139128923416, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 251795144.0, "reward": 0.6071428656578064, "reward_std": 0.17104442417621613, "rewards/simpleverify_reward/mean": 0.6071428656578064, "rewards/simpleverify_reward/std": 0.48865827918052673, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 607.4921875, "completions/mean_terminated_length": 572.0958251953125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 3.5877551020408163, "grad_norm": 0.14430510997772217, "learning_rate": 1e-06, "loss": 0.0389, "num_tokens": 252432977.0, "reward": 0.6640625, "reward_std": 0.16761668026447296, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3366.0, "completions/mean_length": 631.1596069335938, "completions/mean_terminated_length": 596.0033569335938, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 3.5970845481049563, "grad_norm": 0.14243726432323456, "learning_rate": 1e-06, "loss": 0.0356, "num_tokens": 253091768.0, "reward": 0.6316964626312256, "reward_std": 0.18773028254508972, "rewards/simpleverify_reward/mean": 0.6316964030265808, "rewards/simpleverify_reward/std": 0.4826137125492096, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3485.0, "completions/mean_length": 646.4699096679688, "completions/mean_terminated_length": 619.3082275390625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 3.6064139941690962, "grad_norm": 0.13004694879055023, "learning_rate": 1e-06, "loss": 0.02, "num_tokens": 253754293.0, "reward": 0.6004464626312256, "reward_std": 0.17761890590190887, "rewards/simpleverify_reward/mean": 0.6004464030265808, "rewards/simpleverify_reward/std": 0.49008017778396606, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2862.0, "completions/mean_length": 662.1105346679688, "completions/mean_terminated_length": 635.072021484375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 3.6157434402332362, "grad_norm": 0.1350228488445282, "learning_rate": 1e-06, "loss": 0.0334, "num_tokens": 254437928.0, "reward": 0.6383928656578064, "reward_std": 0.19835981726646423, "rewards/simpleverify_reward/mean": 0.6383928656578064, "rewards/simpleverify_reward/std": 0.4807341992855072, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3889.0, "completions/mean_length": 655.4296875, "completions/mean_terminated_length": 628.338623046875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 3.6250728862973762, "grad_norm": 0.13953839242458344, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 255114529.0, "reward": 0.582589328289032, "reward_std": 0.21804721653461456, "rewards/simpleverify_reward/mean": 0.5825892686843872, "rewards/simpleverify_reward/std": 0.4934072494506836, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2261.0, "completions/max_terminated_length": 2261.0, "completions/mean_length": 537.6183471679688, "completions/mean_terminated_length": 537.6183471679688, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 3.6344023323615158, "grad_norm": 0.13515165448188782, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 255685555.0, "reward": 0.6796875596046448, "reward_std": 0.12692874670028687, "rewards/simpleverify_reward/mean": 0.6796875, "rewards/simpleverify_reward/std": 0.4668572247028351, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2649.0, "completions/mean_length": 665.7210083007812, "completions/mean_terminated_length": 638.7109375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 3.643731778425656, "grad_norm": 0.13520081341266632, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 256362273.0, "reward": 0.5647321939468384, "reward_std": 0.2026105672121048, "rewards/simpleverify_reward/mean": 0.5647321343421936, "rewards/simpleverify_reward/std": 0.49606895446777344, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021205357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3731.0, "completions/mean_length": 684.0692138671875, "completions/mean_terminated_length": 610.1505126953125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 3.6530612244897958, "grad_norm": 0.11897967755794525, "learning_rate": 1e-06, "loss": 0.0403, "num_tokens": 257061743.0, "reward": 0.6662946939468384, "reward_std": 0.15721359848976135, "rewards/simpleverify_reward/mean": 0.6662946343421936, "rewards/simpleverify_reward/std": 0.47179922461509705, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3862.0, "completions/mean_length": 658.9810791015625, "completions/mean_terminated_length": 643.5684204101562, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 3.6623906705539357, "grad_norm": 0.13887372612953186, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 257751174.0, "reward": 0.5814732313156128, "reward_std": 0.16653333604335785, "rewards/simpleverify_reward/mean": 0.5814732313156128, "rewards/simpleverify_reward/std": 0.4935929775238037, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 580.1339721679688, "completions/mean_terminated_length": 560.404052734375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 3.6717201166180757, "grad_norm": 0.1309623420238495, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 258366158.0, "reward": 0.676339328289032, "reward_std": 0.14661797881126404, "rewards/simpleverify_reward/mean": 0.6763392686843872, "rewards/simpleverify_reward/std": 0.4681335985660553, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3454.0, "completions/mean_length": 649.46875, "completions/mean_terminated_length": 637.8902587890625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 3.6810495626822157, "grad_norm": 0.1315292865037918, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 259040610.0, "reward": 0.6305803656578064, "reward_std": 0.19279633462429047, "rewards/simpleverify_reward/mean": 0.6305803656578064, "rewards/simpleverify_reward/std": 0.4829172194004059, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3165.0, "completions/mean_length": 640.1239013671875, "completions/mean_terminated_length": 612.9122924804688, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 3.6903790087463557, "grad_norm": 0.12550348043441772, "learning_rate": 1e-06, "loss": 0.0355, "num_tokens": 259698385.0, "reward": 0.590401828289032, "reward_std": 0.1548503190279007, "rewards/simpleverify_reward/mean": 0.5904017686843872, "rewards/simpleverify_reward/std": 0.49203425645828247, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 637.90625, "completions/mean_terminated_length": 586.9943237304688, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 3.6997084548104957, "grad_norm": 0.13938842713832855, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 260358893.0, "reward": 0.6462053656578064, "reward_std": 0.17877934873104095, "rewards/simpleverify_reward/mean": 0.6462053656578064, "rewards/simpleverify_reward/std": 0.478413462638855, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3888.0, "completions/mean_length": 622.2767944335938, "completions/mean_terminated_length": 583.0700073242188, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 3.7090379008746357, "grad_norm": 0.13376736640930176, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 261002813.0, "reward": 0.65625, "reward_std": 0.16852031648159027, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3817.0, "completions/mean_length": 711.0201416015625, "completions/mean_terminated_length": 665.0701904296875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 3.7183673469387752, "grad_norm": 0.1278630495071411, "learning_rate": 1e-06, "loss": 0.0098, "num_tokens": 261730503.0, "reward": 0.5580357313156128, "reward_std": 0.1818607896566391, "rewards/simpleverify_reward/mean": 0.5580357313156128, "rewards/simpleverify_reward/std": 0.49689778685569763, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 605.7221069335938, "completions/mean_terminated_length": 566.3284301757812, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 3.7276967930029157, "grad_norm": 0.13094355165958405, "learning_rate": 1e-06, "loss": 0.0392, "num_tokens": 262353422.0, "reward": 0.691964328289032, "reward_std": 0.16751115024089813, "rewards/simpleverify_reward/mean": 0.6919642686843872, "rewards/simpleverify_reward/std": 0.4619392454624176, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4065.0, "completions/mean_length": 653.7265625, "completions/mean_terminated_length": 630.5202026367188, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 3.7370262390670552, "grad_norm": 0.13285601139068604, "learning_rate": 1e-06, "loss": 0.0314, "num_tokens": 263019145.0, "reward": 0.6729910969734192, "reward_std": 0.20516251027584076, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.46938255429267883, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2107.0, "completions/mean_length": 655.6975708007812, "completions/mean_terminated_length": 620.790283203125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 3.746355685131195, "grad_norm": 0.13895151019096375, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 263688314.0, "reward": 0.6729910969734192, "reward_std": 0.1680731475353241, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.46938255429267883, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 666.1908569335938, "completions/mean_terminated_length": 654.6685791015625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 3.755685131195335, "grad_norm": 0.13235919177532196, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 264370421.0, "reward": 0.6339285969734192, "reward_std": 0.15901736915111542, "rewards/simpleverify_reward/mean": 0.6339285969734192, "rewards/simpleverify_reward/std": 0.48199835419654846, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3615.0, "completions/mean_length": 640.2511596679688, "completions/mean_terminated_length": 605.1871337890625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 3.765014577259475, "grad_norm": 0.14702032506465912, "learning_rate": 1e-06, "loss": 0.0301, "num_tokens": 265024830.0, "reward": 0.6629464626312256, "reward_std": 0.20616962015628815, "rewards/simpleverify_reward/mean": 0.6629464030265808, "rewards/simpleverify_reward/std": 0.47296738624572754, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2771.0, "completions/mean_length": 662.2522583007812, "completions/mean_terminated_length": 642.9832153320312, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 3.774344023323615, "grad_norm": 0.1303194910287857, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 265702792.0, "reward": 0.6819196939468384, "reward_std": 0.17171771824359894, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3767.0, "completions/mean_length": 637.4855346679688, "completions/mean_terminated_length": 610.2531127929688, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 3.783673469387755, "grad_norm": 0.13998155295848846, "learning_rate": 1e-06, "loss": 0.0492, "num_tokens": 266375291.0, "reward": 0.6328125, "reward_std": 0.17826011776924133, "rewards/simpleverify_reward/mean": 0.6328125, "rewards/simpleverify_reward/std": 0.48230743408203125, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 642.5201416015625, "completions/mean_terminated_length": 611.4076538085938, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 3.793002915451895, "grad_norm": 0.1328468769788742, "learning_rate": 1e-06, "loss": 0.0302, "num_tokens": 267030853.0, "reward": 0.6071428656578064, "reward_std": 0.17656511068344116, "rewards/simpleverify_reward/mean": 0.6071428656578064, "rewards/simpleverify_reward/std": 0.48865827918052673, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3975.0, "completions/mean_length": 647.9576416015625, "completions/mean_terminated_length": 632.4955444335938, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 3.8023323615160347, "grad_norm": 0.1378229409456253, "learning_rate": 1e-06, "loss": 0.0265, "num_tokens": 267693255.0, "reward": 0.6417410969734192, "reward_std": 0.18464522063732147, "rewards/simpleverify_reward/mean": 0.6417410969734192, "rewards/simpleverify_reward/std": 0.47975656390190125, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3216.0, "completions/mean_length": 601.9576416015625, "completions/mean_terminated_length": 566.5050659179688, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 3.811661807580175, "grad_norm": 0.1365845501422882, "learning_rate": 1e-06, "loss": 0.0257, "num_tokens": 268311681.0, "reward": 0.6819196939468384, "reward_std": 0.15575045347213745, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 2256.0, "completions/mean_length": 738.7199096679688, "completions/mean_terminated_length": 673.7894897460938, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 3.8209912536443147, "grad_norm": 0.12372700124979019, "learning_rate": 1e-06, "loss": 0.0372, "num_tokens": 269060038.0, "reward": 0.5569196939468384, "reward_std": 0.19892321527004242, "rewards/simpleverify_reward/mean": 0.5569196343421936, "rewards/simpleverify_reward/std": 0.49702703952789307, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3984.0, "completions/mean_length": 696.974365234375, "completions/mean_terminated_length": 666.3524780273438, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 3.8303206997084547, "grad_norm": 0.10448765754699707, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 269776047.0, "reward": 0.574776828289032, "reward_std": 0.12076754868030548, "rewards/simpleverify_reward/mean": 0.5747767686843872, "rewards/simpleverify_reward/std": 0.49465295672416687, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2332.0, "completions/mean_length": 623.8939819335938, "completions/mean_terminated_length": 592.61376953125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 3.8396501457725947, "grad_norm": 0.13010179996490479, "learning_rate": 1e-06, "loss": 0.0324, "num_tokens": 270419872.0, "reward": 0.629464328289032, "reward_std": 0.15541373193264008, "rewards/simpleverify_reward/mean": 0.6294642686843872, "rewards/simpleverify_reward/std": 0.4832179844379425, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 626.5614013671875, "completions/mean_terminated_length": 603.1719360351562, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 3.8489795918367347, "grad_norm": 0.1389056146144867, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 271065847.0, "reward": 0.6462053656578064, "reward_std": 0.17258603870868683, "rewards/simpleverify_reward/mean": 0.6462053656578064, "rewards/simpleverify_reward/std": 0.478413462638855, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3628.0, "completions/mean_length": 559.239990234375, "completions/mean_terminated_length": 551.3277587890625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 3.8583090379008746, "grad_norm": 0.13765226304531097, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 271646206.0, "reward": 0.7287946939468384, "reward_std": 0.15424413979053497, "rewards/simpleverify_reward/mean": 0.7287946343421936, "rewards/simpleverify_reward/std": 0.44483017921447754, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 644.786865234375, "completions/mean_terminated_length": 621.5202026367188, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 3.8676384839650146, "grad_norm": 0.13416855037212372, "learning_rate": 1e-06, "loss": 0.0159, "num_tokens": 272319607.0, "reward": 0.6462053656578064, "reward_std": 0.16893896460533142, "rewards/simpleverify_reward/mean": 0.6462053656578064, "rewards/simpleverify_reward/std": 0.478413462638855, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4015.0, "completions/mean_length": 609.5435791015625, "completions/mean_terminated_length": 589.9786987304688, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 3.8769679300291546, "grad_norm": 0.14630787074565887, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 272949718.0, "reward": 0.6830357313156128, "reward_std": 0.19171005487442017, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3636.0, "completions/mean_length": 734.2142944335938, "completions/mean_terminated_length": 669.19677734375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 3.8862973760932946, "grad_norm": 0.12442808598279953, "learning_rate": 1e-06, "loss": 0.026, "num_tokens": 273703678.0, "reward": 0.59375, "reward_std": 0.1628551334142685, "rewards/simpleverify_reward/mean": 0.59375, "rewards/simpleverify_reward/std": 0.4914066195487976, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3805.0, "completions/mean_length": 674.8292846679688, "completions/mean_terminated_length": 632.3062133789062, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 3.8956268221574346, "grad_norm": 0.13959965109825134, "learning_rate": 1e-06, "loss": 0.0358, "num_tokens": 274401533.0, "reward": 0.6015625, "reward_std": 0.18501752614974976, "rewards/simpleverify_reward/mean": 0.6015625, "rewards/simpleverify_reward/std": 0.48984986543655396, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3668.0, "completions/mean_length": 607.6897583007812, "completions/mean_terminated_length": 595.9708862304688, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 3.904956268221574, "grad_norm": 0.1354520618915558, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 275037535.0, "reward": 0.6774553656578064, "reward_std": 0.1770554929971695, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3832.0, "completions/mean_length": 720.6719360351562, "completions/mean_terminated_length": 659.30224609375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 3.914285714285714, "grad_norm": 0.14345231652259827, "learning_rate": 1e-06, "loss": 0.0327, "num_tokens": 275768465.0, "reward": 0.637276828289032, "reward_std": 0.22165042161941528, "rewards/simpleverify_reward/mean": 0.6372767686843872, "rewards/simpleverify_reward/std": 0.481054425239563, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3248.0, "completions/mean_length": 613.5201416015625, "completions/mean_terminated_length": 593.9776000976562, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 3.923615160349854, "grad_norm": 0.1423853486776352, "learning_rate": 1e-06, "loss": 0.0247, "num_tokens": 276406091.0, "reward": 0.660714328289032, "reward_std": 0.19020093977451324, "rewards/simpleverify_reward/mean": 0.6607142686843872, "rewards/simpleverify_reward/std": 0.4737313687801361, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 4011.0, "completions/mean_length": 701.5078735351562, "completions/mean_terminated_length": 663.1952514648438, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 3.932944606413994, "grad_norm": 0.1399390995502472, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 277127906.0, "reward": 0.578125, "reward_std": 0.17325934767723083, "rewards/simpleverify_reward/mean": 0.578125, "rewards/simpleverify_reward/std": 0.4941346049308777, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3567.0, "completions/mean_length": 692.5000610351562, "completions/mean_terminated_length": 642.391845703125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 3.942274052478134, "grad_norm": 0.12946492433547974, "learning_rate": 1e-06, "loss": 0.0412, "num_tokens": 277834394.0, "reward": 0.6183035969734192, "reward_std": 0.17599989473819733, "rewards/simpleverify_reward/mean": 0.6183035969734192, "rewards/simpleverify_reward/std": 0.4860740303993225, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 687.9453735351562, "completions/mean_terminated_length": 641.68212890625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 3.951603498542274, "grad_norm": 0.1327522099018097, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 278541953.0, "reward": 0.5948660969734192, "reward_std": 0.16442856192588806, "rewards/simpleverify_reward/mean": 0.5948660969734192, "rewards/simpleverify_reward/std": 0.49119213223457336, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2670.0, "completions/mean_length": 613.404052734375, "completions/mean_terminated_length": 578.067626953125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 3.960932944606414, "grad_norm": 0.15041416883468628, "learning_rate": 1e-06, "loss": 0.0345, "num_tokens": 279174075.0, "reward": 0.6696428656578064, "reward_std": 0.19948774576187134, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3603.0, "completions/mean_length": 703.6964721679688, "completions/mean_terminated_length": 665.4085693359375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 3.970262390670554, "grad_norm": 0.14025114476680756, "learning_rate": 1e-06, "loss": 0.0467, "num_tokens": 279893179.0, "reward": 0.640625, "reward_std": 0.17562514543533325, "rewards/simpleverify_reward/mean": 0.640625, "rewards/simpleverify_reward/std": 0.48008525371551514, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3727.0, "completions/mean_length": 646.7288208007812, "completions/mean_terminated_length": 607.7979736328125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 3.979591836734694, "grad_norm": 0.1326093226671219, "learning_rate": 1e-06, "loss": 0.0305, "num_tokens": 280562928.0, "reward": 0.6506696939468384, "reward_std": 0.15706273913383484, "rewards/simpleverify_reward/mean": 0.6506696343421936, "rewards/simpleverify_reward/std": 0.47702476382255554, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 686.9464721679688, "completions/mean_terminated_length": 624.963623046875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 3.9889212827988336, "grad_norm": 0.1411678045988083, "learning_rate": 1e-06, "loss": 0.0243, "num_tokens": 281268800.0, "reward": 0.668526828289032, "reward_std": 0.17107833921909332, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005681818181818232, "completions/max_length": 4096.0, "completions/max_terminated_length": 3372.0, "completions/mean_length": 602.8267211914062, "completions/mean_terminated_length": 582.86572265625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 3.9982507288629736, "grad_norm": 0.14975613355636597, "learning_rate": 1e-06, "loss": 0.0434, "num_tokens": 281928103.0, "reward": 0.6752232313156128, "reward_std": 0.16319075226783752, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3901.0, "completions/mean_length": 636.4185791015625, "completions/mean_terminated_length": 605.2511596679688, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 4.0093294460641395, "grad_norm": 0.12361966073513031, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 282592430.0, "reward": 0.6640625, "reward_std": 0.14725668728351593, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 638.140625, "completions/mean_terminated_length": 622.6345825195312, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 4.01865889212828, "grad_norm": 0.14525309205055237, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 283248940.0, "reward": 0.6350446939468384, "reward_std": 0.18118861317634583, "rewards/simpleverify_reward/mean": 0.6350446343421936, "rewards/simpleverify_reward/std": 0.481686532497406, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2863.0, "completions/mean_length": 682.4174194335938, "completions/mean_terminated_length": 632.1608276367188, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 4.0279883381924195, "grad_norm": 0.1270572692155838, "learning_rate": 1e-06, "loss": 0.0306, "num_tokens": 283938778.0, "reward": 0.640625, "reward_std": 0.15417702496051788, "rewards/simpleverify_reward/mean": 0.640625, "rewards/simpleverify_reward/std": 0.48008525371551514, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3081.0, "completions/mean_length": 682.8114013671875, "completions/mean_terminated_length": 632.560546875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 4.03731778425656, "grad_norm": 0.12802864611148834, "learning_rate": 1e-06, "loss": 0.0404, "num_tokens": 284636561.0, "reward": 0.6428571939468384, "reward_std": 0.19407470524311066, "rewards/simpleverify_reward/mean": 0.6428571343421936, "rewards/simpleverify_reward/std": 0.47942501306533813, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3099.0, "completions/mean_length": 700.0848388671875, "completions/mean_terminated_length": 661.7562255859375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 4.0466472303206995, "grad_norm": 0.12400411069393158, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 285348893.0, "reward": 0.6127232313156128, "reward_std": 0.15181052684783936, "rewards/simpleverify_reward/mean": 0.6127232313156128, "rewards/simpleverify_reward/std": 0.4873998463153839, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3855.0, "completions/mean_length": 719.2935791015625, "completions/mean_terminated_length": 669.579833984375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 4.05597667638484, "grad_norm": 0.16745637357234955, "learning_rate": 1e-06, "loss": 0.0318, "num_tokens": 286093252.0, "reward": 0.6071428656578064, "reward_std": 0.22567518055438995, "rewards/simpleverify_reward/mean": 0.6071428656578064, "rewards/simpleverify_reward/std": 0.48865824937820435, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3311.0, "completions/mean_length": 614.8471069335938, "completions/mean_terminated_length": 567.5916748046875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 4.0653061224489795, "grad_norm": 0.13166189193725586, "learning_rate": 1e-06, "loss": 0.0406, "num_tokens": 286728155.0, "reward": 0.6729910969734192, "reward_std": 0.16153216361999512, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.46938255429267883, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2217.0, "completions/mean_length": 684.3616333007812, "completions/mean_terminated_length": 626.2747192382812, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 4.07463556851312, "grad_norm": 0.12804366648197174, "learning_rate": 1e-06, "loss": 0.0391, "num_tokens": 287431647.0, "reward": 0.6674107313156128, "reward_std": 0.16724900901317596, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 676.3326416015625, "completions/mean_terminated_length": 629.9118041992188, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 4.0839650145772595, "grad_norm": 0.13588641583919525, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 288127297.0, "reward": 0.6037946939468384, "reward_std": 0.17295445501804352, "rewards/simpleverify_reward/mean": 0.6037946343421936, "rewards/simpleverify_reward/std": 0.48938122391700745, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3537.0, "completions/mean_length": 736.9006958007812, "completions/mean_terminated_length": 679.7083129882812, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 4.093294460641399, "grad_norm": 0.13638503849506378, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 288868768.0, "reward": 0.6205357313156128, "reward_std": 0.19046194851398468, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3596.0, "completions/mean_length": 709.6138916015625, "completions/mean_terminated_length": 675.253662109375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 4.1026239067055394, "grad_norm": 0.13001509010791779, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 289595182.0, "reward": 0.5859375, "reward_std": 0.16939541697502136, "rewards/simpleverify_reward/mean": 0.5859375, "rewards/simpleverify_reward/std": 0.4928344786167145, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 722.3504638671875, "completions/mean_terminated_length": 691.9572143554688, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 4.111953352769679, "grad_norm": 0.11731571704149246, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 290324496.0, "reward": 0.5881696939468384, "reward_std": 0.17408238351345062, "rewards/simpleverify_reward/mean": 0.5881696343421936, "rewards/simpleverify_reward/std": 0.4924395978450775, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 710.7232666015625, "completions/mean_terminated_length": 660.8833618164062, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 4.121282798833819, "grad_norm": 0.12428674101829529, "learning_rate": 1e-06, "loss": 0.0492, "num_tokens": 291045024.0, "reward": 0.6540178656578064, "reward_std": 0.16675792634487152, "rewards/simpleverify_reward/mean": 0.6540178656578064, "rewards/simpleverify_reward/std": 0.4759531021118164, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2880.0, "completions/mean_length": 694.5714721679688, "completions/mean_terminated_length": 660.05859375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 4.130612244897959, "grad_norm": 0.14222826063632965, "learning_rate": 1e-06, "loss": 0.0258, "num_tokens": 291757656.0, "reward": 0.621651828289032, "reward_std": 0.19456368684768677, "rewards/simpleverify_reward/mean": 0.6216517686843872, "rewards/simpleverify_reward/std": 0.4852459728717804, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 658.7745971679688, "completions/mean_terminated_length": 623.8984985351562, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 4.139941690962099, "grad_norm": 0.12578943371772766, "learning_rate": 1e-06, "loss": 0.0429, "num_tokens": 292446278.0, "reward": 0.6205357313156128, "reward_std": 0.14489158987998962, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 653.7377319335938, "completions/mean_terminated_length": 610.9525756835938, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 4.149271137026239, "grad_norm": 0.12265264987945557, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 293129131.0, "reward": 0.6462053656578064, "reward_std": 0.13512718677520752, "rewards/simpleverify_reward/mean": 0.6462053656578064, "rewards/simpleverify_reward/std": 0.478413462638855, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 649.5324096679688, "completions/mean_terminated_length": 614.5625610351562, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 4.158600583090379, "grad_norm": 0.15088096261024475, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 293802120.0, "reward": 0.6696428656578064, "reward_std": 0.19523699581623077, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 634.0491333007812, "completions/mean_terminated_length": 610.7101440429688, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 4.167930029154519, "grad_norm": 0.13693316280841827, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 294456252.0, "reward": 0.65625, "reward_std": 0.17133864760398865, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3561.0, "completions/mean_length": 687.5145263671875, "completions/mean_terminated_length": 652.9300537109375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 4.1772594752186585, "grad_norm": 0.15323440730571747, "learning_rate": 1e-06, "loss": 0.0318, "num_tokens": 295162673.0, "reward": 0.629464328289032, "reward_std": 0.187199667096138, "rewards/simpleverify_reward/mean": 0.6294642686843872, "rewards/simpleverify_reward/std": 0.4832179844379425, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3792.0, "completions/mean_length": 665.3326416015625, "completions/mean_terminated_length": 630.5230712890625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 4.186588921282799, "grad_norm": 0.12962469458580017, "learning_rate": 1e-06, "loss": 0.0354, "num_tokens": 295854915.0, "reward": 0.6305803656578064, "reward_std": 0.16686992347240448, "rewards/simpleverify_reward/mean": 0.6305803656578064, "rewards/simpleverify_reward/std": 0.4829172194004059, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3627.0, "completions/mean_length": 691.2891235351562, "completions/mean_terminated_length": 637.2460327148438, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 4.1959183673469385, "grad_norm": 0.13411417603492737, "learning_rate": 1e-06, "loss": 0.0379, "num_tokens": 296563430.0, "reward": 0.6316964626312256, "reward_std": 0.16863909363746643, "rewards/simpleverify_reward/mean": 0.6316964030265808, "rewards/simpleverify_reward/std": 0.4826137125492096, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 634.2545166015625, "completions/mean_terminated_length": 614.8283081054688, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 4.205247813411079, "grad_norm": 0.13314686715602875, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 297220058.0, "reward": 0.6417410969734192, "reward_std": 0.16784563660621643, "rewards/simpleverify_reward/mean": 0.6417410969734192, "rewards/simpleverify_reward/std": 0.47975656390190125, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2414.0, "completions/mean_length": 651.0826416015625, "completions/mean_terminated_length": 639.509521484375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 4.214577259475218, "grad_norm": 0.15532329678535461, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 297894948.0, "reward": 0.6640625, "reward_std": 0.20854607224464417, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 663.693115234375, "completions/mean_terminated_length": 609.2120361328125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 4.223906705539359, "grad_norm": 0.11494933068752289, "learning_rate": 1e-06, "loss": 0.0347, "num_tokens": 298576585.0, "reward": 0.6194196939468384, "reward_std": 0.14237426221370697, "rewards/simpleverify_reward/mean": 0.6194196343421936, "rewards/simpleverify_reward/std": 0.48580074310302734, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3098.0, "completions/mean_length": 720.0111694335938, "completions/mean_terminated_length": 674.1832885742188, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 4.233236151603498, "grad_norm": 0.12973584234714508, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 299315595.0, "reward": 0.590401828289032, "reward_std": 0.17607727646827698, "rewards/simpleverify_reward/mean": 0.5904017686843872, "rewards/simpleverify_reward/std": 0.49203425645828247, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3678.0, "completions/mean_length": 653.6339721679688, "completions/mean_terminated_length": 614.7810668945312, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 4.242565597667639, "grad_norm": 0.14326335489749908, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 299998019.0, "reward": 0.6417410969734192, "reward_std": 0.19170823693275452, "rewards/simpleverify_reward/mean": 0.6417410969734192, "rewards/simpleverify_reward/std": 0.47975656390190125, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2705.0, "completions/mean_length": 668.1640625, "completions/mean_terminated_length": 625.5582275390625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 4.251895043731778, "grad_norm": 0.13656644523143768, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 300681678.0, "reward": 0.6808035969734192, "reward_std": 0.17595891654491425, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.4664256274700165, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 646.3660888671875, "completions/mean_terminated_length": 619.20361328125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 4.261224489795918, "grad_norm": 0.14076030254364014, "learning_rate": 1e-06, "loss": 0.0323, "num_tokens": 301345654.0, "reward": 0.6350446939468384, "reward_std": 0.17912593483924866, "rewards/simpleverify_reward/mean": 0.6350446343421936, "rewards/simpleverify_reward/std": 0.481686532497406, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3112.0, "completions/mean_length": 688.786865234375, "completions/mean_terminated_length": 626.8374633789062, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 4.270553935860058, "grad_norm": 0.13203319907188416, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 302048439.0, "reward": 0.6238839626312256, "reward_std": 0.16288764774799347, "rewards/simpleverify_reward/mean": 0.6238839030265808, "rewards/simpleverify_reward/std": 0.4846802353858948, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3742.0, "completions/mean_length": 694.3092041015625, "completions/mean_terminated_length": 659.7936401367188, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 4.279883381924198, "grad_norm": 0.1184072196483612, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 302756212.0, "reward": 0.6071428656578064, "reward_std": 0.15973231196403503, "rewards/simpleverify_reward/mean": 0.6071428656578064, "rewards/simpleverify_reward/std": 0.48865827918052673, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 649.5480346679688, "completions/mean_terminated_length": 630.2076416015625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 4.289212827988338, "grad_norm": 0.12238530814647675, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 303426983.0, "reward": 0.6428571939468384, "reward_std": 0.15872059762477875, "rewards/simpleverify_reward/mean": 0.6428571343421936, "rewards/simpleverify_reward/std": 0.4794250428676605, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3377.0, "completions/mean_length": 704.0826416015625, "completions/mean_terminated_length": 661.9231567382812, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 4.298542274052478, "grad_norm": 0.1206512302160263, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 304158065.0, "reward": 0.59375, "reward_std": 0.1588318943977356, "rewards/simpleverify_reward/mean": 0.59375, "rewards/simpleverify_reward/std": 0.4914066195487976, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3658.0, "completions/mean_length": 714.6953735351562, "completions/mean_terminated_length": 684.233154296875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 4.307871720116618, "grad_norm": 0.1541004627943039, "learning_rate": 1e-06, "loss": 0.0512, "num_tokens": 304889784.0, "reward": 0.625, "reward_std": 0.220902681350708, "rewards/simpleverify_reward/mean": 0.625, "rewards/simpleverify_reward/std": 0.48439329862594604, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3397.0, "completions/mean_length": 646.0379638671875, "completions/mean_terminated_length": 622.77978515625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 4.317201166180758, "grad_norm": 0.14272978901863098, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 305563290.0, "reward": 0.6863839626312256, "reward_std": 0.15837474167346954, "rewards/simpleverify_reward/mean": 0.6863839030265808, "rewards/simpleverify_reward/std": 0.46422141790390015, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 655.8527221679688, "completions/mean_terminated_length": 624.8603515625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 4.326530612244898, "grad_norm": 0.14344608783721924, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 306241462.0, "reward": 0.613839328289032, "reward_std": 0.17795519530773163, "rewards/simpleverify_reward/mean": 0.6138392686843872, "rewards/simpleverify_reward/std": 0.48714008927345276, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3815.0, "completions/mean_length": 645.9486694335938, "completions/mean_terminated_length": 634.3583374023438, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 4.335860058309038, "grad_norm": 0.1184195801615715, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 306907048.0, "reward": 0.6785714626312256, "reward_std": 0.12441141903400421, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3697.0, "completions/mean_length": 624.9230346679688, "completions/mean_terminated_length": 573.8199462890625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 4.345189504373177, "grad_norm": 0.14943735301494598, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 307553923.0, "reward": 0.7120535969734192, "reward_std": 0.12847179174423218, "rewards/simpleverify_reward/mean": 0.7120535969734192, "rewards/simpleverify_reward/std": 0.4530589282512665, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2752.0, "completions/mean_length": 615.7467041015625, "completions/mean_terminated_length": 592.2843017578125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 4.354518950437318, "grad_norm": 0.1307288259267807, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 308193280.0, "reward": 0.6830357313156128, "reward_std": 0.14995694160461426, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2838.0, "completions/mean_length": 603.5335083007812, "completions/mean_terminated_length": 576.0337524414062, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 4.363848396501457, "grad_norm": 0.13896164298057556, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 308821510.0, "reward": 0.6383928656578064, "reward_std": 0.17160853743553162, "rewards/simpleverify_reward/mean": 0.6383928656578064, "rewards/simpleverify_reward/std": 0.4807341992855072, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3107.0, "completions/mean_length": 644.2120971679688, "completions/mean_terminated_length": 624.841796875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 4.373177842565598, "grad_norm": 0.13675406575202942, "learning_rate": 1e-06, "loss": 0.0314, "num_tokens": 309484348.0, "reward": 0.6930803656578064, "reward_std": 0.150902658700943, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147337555885315, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3410.0, "completions/mean_length": 677.8270263671875, "completions/mean_terminated_length": 650.9122924804688, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 4.382507288629737, "grad_norm": 0.1313118040561676, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 310181601.0, "reward": 0.6082589626312256, "reward_std": 0.15454471111297607, "rewards/simpleverify_reward/mean": 0.6082589030265808, "rewards/simpleverify_reward/std": 0.4884119927883148, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3853.0, "completions/mean_length": 697.4631958007812, "completions/mean_terminated_length": 666.8457641601562, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 4.391836734693878, "grad_norm": 0.13013240694999695, "learning_rate": 1e-06, "loss": 0.0408, "num_tokens": 310902544.0, "reward": 0.5870535969734192, "reward_std": 0.1608920842409134, "rewards/simpleverify_reward/mean": 0.5870535969734192, "rewards/simpleverify_reward/std": 0.49263837933540344, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 692.0848388671875, "completions/mean_terminated_length": 657.5467529296875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 4.401166180758017, "grad_norm": 0.1385337859392166, "learning_rate": 1e-06, "loss": 0.0302, "num_tokens": 311620284.0, "reward": 0.621651828289032, "reward_std": 0.17754262685775757, "rewards/simpleverify_reward/mean": 0.6216517686843872, "rewards/simpleverify_reward/std": 0.4852459728717804, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 631.296875, "completions/mean_terminated_length": 596.1420288085938, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 4.410495626822158, "grad_norm": 0.15034440159797668, "learning_rate": 1e-06, "loss": 0.0335, "num_tokens": 312277374.0, "reward": 0.6205357313156128, "reward_std": 0.19080963730812073, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 670.364990234375, "completions/mean_terminated_length": 612.0397338867188, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 4.419825072886297, "grad_norm": 0.15315893292427063, "learning_rate": 1e-06, "loss": 0.0358, "num_tokens": 312969845.0, "reward": 0.6261160969734192, "reward_std": 0.20200397074222565, "rewards/simpleverify_reward/mean": 0.6261160969734192, "rewards/simpleverify_reward/std": 0.48410359025001526, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3147.0, "completions/mean_length": 664.5614013671875, "completions/mean_terminated_length": 649.173828125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 4.429154518950437, "grad_norm": 0.1413564831018448, "learning_rate": 1e-06, "loss": 0.0168, "num_tokens": 313656260.0, "reward": 0.6439732313156128, "reward_std": 0.17314878106117249, "rewards/simpleverify_reward/mean": 0.6439732313156128, "rewards/simpleverify_reward/std": 0.47909072041511536, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3847.0, "completions/mean_length": 725.2846069335938, "completions/mean_terminated_length": 687.2404174804688, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 4.438483965014577, "grad_norm": 0.11348707228899002, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 314397363.0, "reward": 0.5714285969734192, "reward_std": 0.14432887732982635, "rewards/simpleverify_reward/mean": 0.5714285969734192, "rewards/simpleverify_reward/std": 0.49514803290367126, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2526.0, "completions/mean_length": 624.2455444335938, "completions/mean_terminated_length": 600.8404541015625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 4.447813411078717, "grad_norm": 0.1413491666316986, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 315045695.0, "reward": 0.6863839626312256, "reward_std": 0.1464318335056305, "rewards/simpleverify_reward/mean": 0.6863839030265808, "rewards/simpleverify_reward/std": 0.4642214775085449, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3555.0, "completions/mean_length": 605.78125, "completions/mean_terminated_length": 586.1953125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 4.457142857142857, "grad_norm": 0.15461480617523193, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 315674315.0, "reward": 0.6964285969734192, "reward_std": 0.18159978091716766, "rewards/simpleverify_reward/mean": 0.6964285969734192, "rewards/simpleverify_reward/std": 0.4600565731525421, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 662.8538208007812, "completions/mean_terminated_length": 643.588134765625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 4.466472303206997, "grad_norm": 0.1429339200258255, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 316355728.0, "reward": 0.6495535969734192, "reward_std": 0.19317178428173065, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.47737622261047363, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3925.0, "completions/mean_length": 643.9185791015625, "completions/mean_terminated_length": 620.6460571289062, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 4.475801749271137, "grad_norm": 0.16214551031589508, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 317023463.0, "reward": 0.676339328289032, "reward_std": 0.200877845287323, "rewards/simpleverify_reward/mean": 0.6763392686843872, "rewards/simpleverify_reward/std": 0.4681335985660553, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3238.0, "completions/mean_length": 689.310302734375, "completions/mean_terminated_length": 670.1930541992188, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 4.485131195335277, "grad_norm": 0.15121741592884064, "learning_rate": 1e-06, "loss": 0.0315, "num_tokens": 317734909.0, "reward": 0.6004464626312256, "reward_std": 0.15838289260864258, "rewards/simpleverify_reward/mean": 0.6004464030265808, "rewards/simpleverify_reward/std": 0.49008017778396606, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3855.0, "completions/mean_length": 625.4989013671875, "completions/mean_terminated_length": 582.3627319335938, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 4.494460641399417, "grad_norm": 0.14671459794044495, "learning_rate": 1e-06, "loss": 0.0344, "num_tokens": 318382132.0, "reward": 0.6283482313156128, "reward_std": 0.1740809977054596, "rewards/simpleverify_reward/mean": 0.6283482313156128, "rewards/simpleverify_reward/std": 0.4835159480571747, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 692.3694458007812, "completions/mean_terminated_length": 646.1663208007812, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 4.503790087463557, "grad_norm": 0.13092166185379028, "learning_rate": 1e-06, "loss": 0.0236, "num_tokens": 319094735.0, "reward": 0.629464328289032, "reward_std": 0.13794127106666565, "rewards/simpleverify_reward/mean": 0.6294642686843872, "rewards/simpleverify_reward/std": 0.4832179844379425, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 601.1953125, "completions/mean_terminated_length": 581.5836181640625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 4.513119533527696, "grad_norm": 0.1335403174161911, "learning_rate": 1e-06, "loss": 0.0321, "num_tokens": 319713574.0, "reward": 0.6986607313156128, "reward_std": 0.14233148097991943, "rewards/simpleverify_reward/mean": 0.6986607313156128, "rewards/simpleverify_reward/std": 0.4590960144996643, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3884.0, "completions/mean_length": 669.1027221679688, "completions/mean_terminated_length": 638.229736328125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 4.522448979591837, "grad_norm": 0.13420091569423676, "learning_rate": 1e-06, "loss": 0.0239, "num_tokens": 320395386.0, "reward": 0.621651828289032, "reward_std": 0.16995178163051605, "rewards/simpleverify_reward/mean": 0.6216517686843872, "rewards/simpleverify_reward/std": 0.4852459728717804, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 664.958740234375, "completions/mean_terminated_length": 657.2830200195312, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 4.531778425655976, "grad_norm": 0.1282912790775299, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 321084069.0, "reward": 0.5758928656578064, "reward_std": 0.13218912482261658, "rewards/simpleverify_reward/mean": 0.5758928656578064, "rewards/simpleverify_reward/std": 0.49448275566101074, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3290.0, "completions/mean_length": 595.4129638671875, "completions/mean_terminated_length": 579.7152709960938, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 4.541107871720117, "grad_norm": 0.12625464797019958, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 321695911.0, "reward": 0.6741071939468384, "reward_std": 0.13015852868556976, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.46896928548812866, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3826.0, "completions/mean_length": 697.0725708007812, "completions/mean_terminated_length": 658.7099609375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 4.550437317784256, "grad_norm": 0.12999670207500458, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 322406496.0, "reward": 0.6752232313156128, "reward_std": 0.1439472883939743, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3173.0, "completions/mean_length": 589.7779541015625, "completions/mean_terminated_length": 577.9989013671875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 4.559766763848397, "grad_norm": 0.14309996366500854, "learning_rate": 1e-06, "loss": 0.0181, "num_tokens": 323020833.0, "reward": 0.746651828289032, "reward_std": 0.13519500195980072, "rewards/simpleverify_reward/mean": 0.7466517686843872, "rewards/simpleverify_reward/std": 0.435171514749527, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2637.0, "completions/mean_length": 674.911865234375, "completions/mean_terminated_length": 640.1995239257812, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 4.569096209912536, "grad_norm": 0.1348610669374466, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 323718994.0, "reward": 0.637276828289032, "reward_std": 0.17754150927066803, "rewards/simpleverify_reward/mean": 0.6372767686843872, "rewards/simpleverify_reward/std": 0.481054425239563, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3977.0, "completions/mean_length": 711.6428833007812, "completions/mean_terminated_length": 665.701416015625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 4.578425655976677, "grad_norm": 0.14155526459217072, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 324454178.0, "reward": 0.6026785969734192, "reward_std": 0.1929478943347931, "rewards/simpleverify_reward/mean": 0.6026785969734192, "rewards/simpleverify_reward/std": 0.48961687088012695, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 626.2533569335938, "completions/mean_terminated_length": 610.6939697265625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 4.587755102040816, "grad_norm": 0.15720123052597046, "learning_rate": 1e-06, "loss": 0.0321, "num_tokens": 325101141.0, "reward": 0.6830357313156128, "reward_std": 0.15680494904518127, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 629.8817138671875, "completions/mean_terminated_length": 614.338623046875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 4.597084548104956, "grad_norm": 0.14239463210105896, "learning_rate": 1e-06, "loss": 0.0352, "num_tokens": 325753819.0, "reward": 0.6573660969734192, "reward_std": 0.18047252297401428, "rewards/simpleverify_reward/mean": 0.6573660969734192, "rewards/simpleverify_reward/std": 0.47485533356666565, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 623.1015625, "completions/mean_terminated_length": 591.814208984375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 4.606413994169096, "grad_norm": 0.14008277654647827, "learning_rate": 1e-06, "loss": 0.0207, "num_tokens": 326397910.0, "reward": 0.625, "reward_std": 0.1553688645362854, "rewards/simpleverify_reward/mean": 0.625, "rewards/simpleverify_reward/std": 0.48439329862594604, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2414.0, "completions/mean_length": 620.2533569335938, "completions/mean_terminated_length": 592.8853149414062, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 4.615743440233236, "grad_norm": 0.1633065640926361, "learning_rate": 1e-06, "loss": 0.0398, "num_tokens": 327033041.0, "reward": 0.6808035969734192, "reward_std": 0.1761571764945984, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.46642565727233887, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 611.1417846679688, "completions/mean_terminated_length": 587.6483154296875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 4.625072886297376, "grad_norm": 0.164765402674675, "learning_rate": 1e-06, "loss": 0.0282, "num_tokens": 327666256.0, "reward": 0.707589328289032, "reward_std": 0.1813080757856369, "rewards/simpleverify_reward/mean": 0.7075892686843872, "rewards/simpleverify_reward/std": 0.45512402057647705, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3533.0, "completions/mean_length": 678.8973388671875, "completions/mean_terminated_length": 640.32958984375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 4.634402332361516, "grad_norm": 0.12950468063354492, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 328366308.0, "reward": 0.59375, "reward_std": 0.17719705402851105, "rewards/simpleverify_reward/mean": 0.59375, "rewards/simpleverify_reward/std": 0.4914066195487976, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3626.0, "completions/mean_length": 599.9364013671875, "completions/mean_terminated_length": 580.317626953125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 4.643731778425656, "grad_norm": 0.14470253884792328, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 328986683.0, "reward": 0.6852678656578064, "reward_std": 0.15924488008022308, "rewards/simpleverify_reward/mean": 0.6852678656578064, "rewards/simpleverify_reward/std": 0.46466848254203796, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3233.0, "completions/mean_length": 710.7857666015625, "completions/mean_terminated_length": 660.94677734375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 4.653061224489796, "grad_norm": 0.1296951025724411, "learning_rate": 1e-06, "loss": 0.0481, "num_tokens": 329713083.0, "reward": 0.6339285969734192, "reward_std": 0.17577487230300903, "rewards/simpleverify_reward/mean": 0.6339285969734192, "rewards/simpleverify_reward/std": 0.48199835419654846, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3313.0, "completions/mean_length": 662.474365234375, "completions/mean_terminated_length": 611.9241333007812, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 4.662390670553936, "grad_norm": 0.1370389312505722, "learning_rate": 1e-06, "loss": 0.0464, "num_tokens": 330397692.0, "reward": 0.6863839626312256, "reward_std": 0.1586022675037384, "rewards/simpleverify_reward/mean": 0.6863839030265808, "rewards/simpleverify_reward/std": 0.46422144770622253, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3241.0, "completions/mean_length": 660.1495971679688, "completions/mean_terminated_length": 625.2874755859375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 4.671720116618076, "grad_norm": 0.15144044160842896, "learning_rate": 1e-06, "loss": 0.0293, "num_tokens": 331080330.0, "reward": 0.6339285969734192, "reward_std": 0.19355519115924835, "rewards/simpleverify_reward/mean": 0.6339285969734192, "rewards/simpleverify_reward/std": 0.48199835419654846, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3399.0, "completions/mean_length": 657.5, "completions/mean_terminated_length": 634.319091796875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 4.681049562682215, "grad_norm": 0.13537172973155975, "learning_rate": 1e-06, "loss": 0.0205, "num_tokens": 331764650.0, "reward": 0.6127232313156128, "reward_std": 0.14417731761932373, "rewards/simpleverify_reward/mean": 0.6127232313156128, "rewards/simpleverify_reward/std": 0.4873998463153839, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 645.8828125, "completions/mean_terminated_length": 626.5219116210938, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 4.690379008746356, "grad_norm": 0.15210549533367157, "learning_rate": 1e-06, "loss": 0.0205, "num_tokens": 332441937.0, "reward": 0.6026785969734192, "reward_std": 0.1970798820257187, "rewards/simpleverify_reward/mean": 0.6026785969734192, "rewards/simpleverify_reward/std": 0.48961687088012695, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.022321428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3404.0, "completions/mean_length": 719.4844360351562, "completions/mean_terminated_length": 642.3949584960938, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 4.699708454810495, "grad_norm": 0.13595300912857056, "learning_rate": 1e-06, "loss": 0.0353, "num_tokens": 333180875.0, "reward": 0.6205357313156128, "reward_std": 0.19215510785579681, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3708.0, "completions/mean_length": 703.2991333007812, "completions/mean_terminated_length": 684.2604370117188, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 4.709037900874636, "grad_norm": 0.13761140406131744, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 333902623.0, "reward": 0.6361607313156128, "reward_std": 0.15984967350959778, "rewards/simpleverify_reward/mean": 0.6361607313156128, "rewards/simpleverify_reward/std": 0.4813718795776367, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2455.0, "completions/mean_length": 673.1707763671875, "completions/mean_terminated_length": 638.4407958984375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 4.718367346938775, "grad_norm": 0.12781095504760742, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 334600552.0, "reward": 0.613839328289032, "reward_std": 0.15855950117111206, "rewards/simpleverify_reward/mean": 0.6138392686843872, "rewards/simpleverify_reward/std": 0.48714008927345276, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 708.9241333007812, "completions/mean_terminated_length": 666.8248901367188, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 4.727696793002916, "grad_norm": 0.13325197994709015, "learning_rate": 1e-06, "loss": 0.0362, "num_tokens": 335325260.0, "reward": 0.5959821939468384, "reward_std": 0.17585155367851257, "rewards/simpleverify_reward/mean": 0.5959821343421936, "rewards/simpleverify_reward/std": 0.490975022315979, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2856.0, "completions/mean_length": 596.5189819335938, "completions/mean_terminated_length": 580.8262329101562, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 4.737026239067055, "grad_norm": 0.1491280347108841, "learning_rate": 1e-06, "loss": 0.0249, "num_tokens": 335957853.0, "reward": 0.6651785969734192, "reward_std": 0.1581925004720688, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219157218933105, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 702.9542846679688, "completions/mean_terminated_length": 672.3862915039062, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 4.746355685131196, "grad_norm": 0.13787421584129333, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 336682412.0, "reward": 0.566964328289032, "reward_std": 0.19396595656871796, "rewards/simpleverify_reward/mean": 0.5669642686843872, "rewards/simpleverify_reward/std": 0.49577224254608154, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3846.0, "completions/mean_length": 635.0535888671875, "completions/mean_terminated_length": 615.6318969726562, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 4.755685131195335, "grad_norm": 0.1455698162317276, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 337345044.0, "reward": 0.6473214626312256, "reward_std": 0.1634521782398224, "rewards/simpleverify_reward/mean": 0.6473214030265808, "rewards/simpleverify_reward/std": 0.47807058691978455, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3709.0, "completions/mean_length": 669.5301513671875, "completions/mean_terminated_length": 638.6610717773438, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 4.765014577259475, "grad_norm": 0.13055679202079773, "learning_rate": 1e-06, "loss": 0.0369, "num_tokens": 338029543.0, "reward": 0.7366071939468384, "reward_std": 0.15090197324752808, "rewards/simpleverify_reward/mean": 0.7366071343421936, "rewards/simpleverify_reward/std": 0.44071969389915466, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2547.0, "completions/mean_length": 688.2288208007812, "completions/mean_terminated_length": 665.2550659179688, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 4.774344023323615, "grad_norm": 0.14087484776973724, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 338749180.0, "reward": 0.6428571939468384, "reward_std": 0.19234946370124817, "rewards/simpleverify_reward/mean": 0.6428571343421936, "rewards/simpleverify_reward/std": 0.4794250428676605, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3323.0, "completions/mean_length": 577.4520263671875, "completions/mean_terminated_length": 569.58056640625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 4.783673469387755, "grad_norm": 0.14671212434768677, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 339351625.0, "reward": 0.65625, "reward_std": 0.14147454500198364, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 645.7199096679688, "completions/mean_terminated_length": 618.5523071289062, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 4.793002915451895, "grad_norm": 0.1405312716960907, "learning_rate": 1e-06, "loss": 0.0262, "num_tokens": 340022502.0, "reward": 0.6049107313156128, "reward_std": 0.16281278431415558, "rewards/simpleverify_reward/mean": 0.6049107313156128, "rewards/simpleverify_reward/std": 0.48914292454719543, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 675.6819458007812, "completions/mean_terminated_length": 617.447265625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 4.802332361516035, "grad_norm": 0.1476939171552658, "learning_rate": 1e-06, "loss": 0.0455, "num_tokens": 340708801.0, "reward": 0.6484375, "reward_std": 0.18667609989643097, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2726.0, "completions/mean_length": 684.3225708007812, "completions/mean_terminated_length": 657.458984375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 4.811661807580175, "grad_norm": 0.1389569789171219, "learning_rate": 1e-06, "loss": 0.0354, "num_tokens": 341418730.0, "reward": 0.6205357313156128, "reward_std": 0.1878686398267746, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 4011.0, "completions/mean_length": 661.8125, "completions/mean_terminated_length": 650.2755126953125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 4.820991253644315, "grad_norm": 0.1486596167087555, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 342105810.0, "reward": 0.5837053656578064, "reward_std": 0.19347921013832092, "rewards/simpleverify_reward/mean": 0.5837053656578064, "rewards/simpleverify_reward/std": 0.49321895837783813, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3986.0, "completions/mean_length": 674.4542846679688, "completions/mean_terminated_length": 651.3876342773438, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 4.830320699708455, "grad_norm": 0.14460867643356323, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 342798993.0, "reward": 0.6015625, "reward_std": 0.16976520419120789, "rewards/simpleverify_reward/mean": 0.6015625, "rewards/simpleverify_reward/std": 0.48984986543655396, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3183.0, "completions/mean_length": 700.1707763671875, "completions/mean_terminated_length": 650.175537109375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 4.839650145772595, "grad_norm": 0.13290877640247345, "learning_rate": 1e-06, "loss": 0.0314, "num_tokens": 343508658.0, "reward": 0.6127232313156128, "reward_std": 0.14432819187641144, "rewards/simpleverify_reward/mean": 0.6127232313156128, "rewards/simpleverify_reward/std": 0.4873998463153839, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3995.0, "completions/mean_length": 692.669677734375, "completions/mean_terminated_length": 662.009033203125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 4.848979591836734, "grad_norm": 0.13732337951660156, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 344212394.0, "reward": 0.6350446939468384, "reward_std": 0.156542107462883, "rewards/simpleverify_reward/mean": 0.6350446343421936, "rewards/simpleverify_reward/std": 0.481686532497406, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3877.0, "completions/mean_length": 690.2310791015625, "completions/mean_terminated_length": 655.6741333007812, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 4.858309037900875, "grad_norm": 0.13731598854064941, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 344918673.0, "reward": 0.574776828289032, "reward_std": 0.17281357944011688, "rewards/simpleverify_reward/mean": 0.5747767686843872, "rewards/simpleverify_reward/std": 0.49465295672416687, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2158.0, "completions/mean_length": 637.4799194335938, "completions/mean_terminated_length": 618.0718383789062, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 4.867638483965014, "grad_norm": 0.13589094579219818, "learning_rate": 1e-06, "loss": 0.034, "num_tokens": 345582743.0, "reward": 0.6540178656578064, "reward_std": 0.16341009736061096, "rewards/simpleverify_reward/mean": 0.6540178656578064, "rewards/simpleverify_reward/std": 0.4759531021118164, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3156.0, "completions/mean_length": 603.9967041015625, "completions/mean_terminated_length": 576.5006103515625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 4.876967930029155, "grad_norm": 0.15138036012649536, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 346208892.0, "reward": 0.6953125596046448, "reward_std": 0.14628097414970398, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3423.0, "completions/mean_length": 635.0089721679688, "completions/mean_terminated_length": 631.1419067382812, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 4.886297376093294, "grad_norm": 0.15283583104610443, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 346874260.0, "reward": 0.590401828289032, "reward_std": 0.1970803141593933, "rewards/simpleverify_reward/mean": 0.5904017686843872, "rewards/simpleverify_reward/std": 0.49203425645828247, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3526.0, "completions/mean_length": 632.7611694335938, "completions/mean_terminated_length": 589.7152709960938, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 4.895626822157435, "grad_norm": 0.13007193803787231, "learning_rate": 1e-06, "loss": 0.0445, "num_tokens": 347532518.0, "reward": 0.645089328289032, "reward_std": 0.14951439201831818, "rewards/simpleverify_reward/mean": 0.6450892686843872, "rewards/simpleverify_reward/std": 0.4787535071372986, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 710.5267944335938, "completions/mean_terminated_length": 702.9530029296875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 4.904956268221574, "grad_norm": 0.1379213035106659, "learning_rate": 1e-06, "loss": 0.0322, "num_tokens": 348265326.0, "reward": 0.566964328289032, "reward_std": 0.2028701901435852, "rewards/simpleverify_reward/mean": 0.5669642686843872, "rewards/simpleverify_reward/std": 0.49577224254608154, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3952.0, "completions/mean_length": 712.060302734375, "completions/mean_terminated_length": 673.8668212890625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 4.914285714285715, "grad_norm": 0.13664761185646057, "learning_rate": 1e-06, "loss": 0.0312, "num_tokens": 348995148.0, "reward": 0.6194196939468384, "reward_std": 0.17400822043418884, "rewards/simpleverify_reward/mean": 0.6194196343421936, "rewards/simpleverify_reward/std": 0.48580074310302734, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3293.0, "completions/mean_length": 637.1328125, "completions/mean_terminated_length": 598.0936889648438, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 4.923615160349854, "grad_norm": 0.13823190331459045, "learning_rate": 1e-06, "loss": 0.0222, "num_tokens": 349655483.0, "reward": 0.609375, "reward_std": 0.1643179953098297, "rewards/simpleverify_reward/mean": 0.609375, "rewards/simpleverify_reward/std": 0.48816296458244324, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2787.0, "completions/mean_length": 667.4642944335938, "completions/mean_terminated_length": 652.0897216796875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 4.932944606413994, "grad_norm": 0.14011141657829285, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 350342563.0, "reward": 0.6551339626312256, "reward_std": 0.17724235355854034, "rewards/simpleverify_reward/mean": 0.6551339030265808, "rewards/simpleverify_reward/std": 0.4755900502204895, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 622.474365234375, "completions/mean_terminated_length": 599.0573120117188, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 4.942274052478134, "grad_norm": 0.15328559279441833, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 350980412.0, "reward": 0.6651785969734192, "reward_std": 0.1820116490125656, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219160199165344, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3666.0, "completions/mean_length": 656.5245971679688, "completions/mean_terminated_length": 648.8299560546875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 4.9516034985422746, "grad_norm": 0.14975692331790924, "learning_rate": 1e-06, "loss": 0.0348, "num_tokens": 351658466.0, "reward": 0.6674107313156128, "reward_std": 0.16326671838760376, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3594.0, "completions/mean_length": 618.3850708007812, "completions/mean_terminated_length": 587.05517578125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 4.960932944606414, "grad_norm": 0.12356135249137878, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 352296787.0, "reward": 0.7254464626312256, "reward_std": 0.12151501327753067, "rewards/simpleverify_reward/mean": 0.7254464030265808, "rewards/simpleverify_reward/std": 0.4465382993221283, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3538.0, "completions/mean_length": 674.505615234375, "completions/mean_terminated_length": 647.564697265625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 4.970262390670554, "grad_norm": 0.1554206907749176, "learning_rate": 1e-06, "loss": 0.0381, "num_tokens": 352980832.0, "reward": 0.6752232313156128, "reward_std": 0.20478133857250214, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 636.4308471679688, "completions/mean_terminated_length": 620.9170532226562, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 4.979591836734694, "grad_norm": 0.13316890597343445, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 353643778.0, "reward": 0.6283482313156128, "reward_std": 0.15631206333637238, "rewards/simpleverify_reward/mean": 0.6283482313156128, "rewards/simpleverify_reward/std": 0.4835159480571747, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3714.0, "completions/mean_length": 648.4855346679688, "completions/mean_terminated_length": 625.2438354492188, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 4.988921282798834, "grad_norm": 0.14230412244796753, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 354317029.0, "reward": 0.590401828289032, "reward_std": 0.16315613687038422, "rewards/simpleverify_reward/mean": 0.5904017686843872, "rewards/simpleverify_reward/std": 0.49203425645828247, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011363636363636354, "completions/max_length": 4096.0, "completions/max_terminated_length": 2200.0, "completions/mean_length": 608.9091186523438, "completions/mean_terminated_length": 568.8275756835938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 4.998250728862974, "grad_norm": 0.1377047598361969, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 354993615.0, "reward": 0.6183035969734192, "reward_std": 0.15643151104450226, "rewards/simpleverify_reward/mean": 0.6183035969734192, "rewards/simpleverify_reward/std": 0.4860740303993225, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3251.0, "completions/mean_length": 642.0, "completions/mean_terminated_length": 618.714599609375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 5.0093294460641395, "grad_norm": 0.12727832794189453, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 355646439.0, "reward": 0.6897321939468384, "reward_std": 0.14162679016590118, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 667.185302734375, "completions/mean_terminated_length": 640.186767578125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 5.01865889212828, "grad_norm": 0.15376044809818268, "learning_rate": 1e-06, "loss": 0.0393, "num_tokens": 356325957.0, "reward": 0.5837053656578064, "reward_std": 0.19884906709194183, "rewards/simpleverify_reward/mean": 0.5837053656578064, "rewards/simpleverify_reward/std": 0.49321892857551575, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0200892857142857, "completions/max_length": 4096.0, "completions/max_terminated_length": 3578.0, "completions/mean_length": 754.8973388671875, "completions/mean_terminated_length": 686.4009399414062, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 5.0279883381924195, "grad_norm": 0.14517363905906677, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 357092705.0, "reward": 0.5892857313156128, "reward_std": 0.16706354916095734, "rewards/simpleverify_reward/mean": 0.5892857313156128, "rewards/simpleverify_reward/std": 0.49223825335502625, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4006.0, "completions/mean_length": 634.3783569335938, "completions/mean_terminated_length": 607.1215209960938, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 5.03731778425656, "grad_norm": 0.1381053775548935, "learning_rate": 1e-06, "loss": 0.0299, "num_tokens": 357753716.0, "reward": 0.65625, "reward_std": 0.144252210855484, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 637.296875, "completions/mean_terminated_length": 621.7870483398438, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 5.0466472303206995, "grad_norm": 0.1443057805299759, "learning_rate": 1e-06, "loss": 0.0236, "num_tokens": 358407846.0, "reward": 0.7053571939468384, "reward_std": 0.15616440773010254, "rewards/simpleverify_reward/mean": 0.7053571343421936, "rewards/simpleverify_reward/std": 0.45613667368888855, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3485.0, "completions/mean_length": 617.5870971679688, "completions/mean_terminated_length": 605.9014892578125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 5.05597667638484, "grad_norm": 0.13979914784431458, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 359044132.0, "reward": 0.6707589626312256, "reward_std": 0.15634779632091522, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3726.0, "completions/mean_length": 626.1986694335938, "completions/mean_terminated_length": 614.5419921875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 5.0653061224489795, "grad_norm": 0.15431521832942963, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 359687742.0, "reward": 0.6395089626312256, "reward_std": 0.17367304861545563, "rewards/simpleverify_reward/mean": 0.6395089030265808, "rewards/simpleverify_reward/std": 0.4804111123085022, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2455.0, "completions/mean_length": 618.8527221679688, "completions/mean_terminated_length": 591.4735717773438, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 5.07463556851312, "grad_norm": 0.14621497690677643, "learning_rate": 1e-06, "loss": 0.0293, "num_tokens": 360329682.0, "reward": 0.6584821939468384, "reward_std": 0.14207187294960022, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3499.0, "completions/mean_length": 625.9185791015625, "completions/mean_terminated_length": 610.357666015625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 5.0839650145772595, "grad_norm": 0.11382981389760971, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 360984009.0, "reward": 0.6495535969734192, "reward_std": 0.12741801142692566, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.477376252412796, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 655.1517944335938, "completions/mean_terminated_length": 635.8428955078125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 5.093294460641399, "grad_norm": 0.1321692317724228, "learning_rate": 1e-06, "loss": 0.0208, "num_tokens": 361670281.0, "reward": 0.5602678656578064, "reward_std": 0.1506730169057846, "rewards/simpleverify_reward/mean": 0.5602678656578064, "rewards/simpleverify_reward/std": 0.4966317117214203, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3858.0, "completions/mean_length": 649.6339721679688, "completions/mean_terminated_length": 582.9806518554688, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 5.1026239067055394, "grad_norm": 0.1351502388715744, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 362335321.0, "reward": 0.6774553656578064, "reward_std": 0.15349486470222473, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3238.0, "completions/mean_length": 599.5960083007812, "completions/mean_terminated_length": 591.7740478515625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 5.111953352769679, "grad_norm": 0.1279367357492447, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 362962575.0, "reward": 0.7008928656578064, "reward_std": 0.1245201975107193, "rewards/simpleverify_reward/mean": 0.7008928656578064, "rewards/simpleverify_reward/std": 0.4581226110458374, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 606.7444458007812, "completions/mean_terminated_length": 598.9384765625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 5.121282798833819, "grad_norm": 0.132477805018425, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 363588186.0, "reward": 0.7287946939468384, "reward_std": 0.15367889404296875, "rewards/simpleverify_reward/mean": 0.7287946343421936, "rewards/simpleverify_reward/std": 0.44483017921447754, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 641.6127319335938, "completions/mean_terminated_length": 637.7530517578125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 5.130612244897959, "grad_norm": 0.12332062423229218, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 364254063.0, "reward": 0.6674107313156128, "reward_std": 0.12903310358524323, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2862.0, "completions/mean_length": 656.0725708007812, "completions/mean_terminated_length": 621.1690673828125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 5.139941690962099, "grad_norm": 0.1380327343940735, "learning_rate": 1e-06, "loss": 0.0258, "num_tokens": 364941160.0, "reward": 0.5390625, "reward_std": 0.15213796496391296, "rewards/simpleverify_reward/mean": 0.5390625, "rewards/simpleverify_reward/std": 0.4987502098083496, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 603.3717041015625, "completions/mean_terminated_length": 587.7096557617188, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 5.149271137026239, "grad_norm": 0.14188458025455475, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 365570117.0, "reward": 0.6741071939468384, "reward_std": 0.13598594069480896, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2526.0, "completions/mean_length": 674.3147583007812, "completions/mean_terminated_length": 651.2471923828125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 5.158600583090379, "grad_norm": 0.1423128992319107, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 366261583.0, "reward": 0.6752232313156128, "reward_std": 0.18362854421138763, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3508.0, "completions/mean_length": 620.4799194335938, "completions/mean_terminated_length": 597.0494384765625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 5.167930029154519, "grad_norm": 0.14872750639915466, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 366903757.0, "reward": 0.660714328289032, "reward_std": 0.163757786154747, "rewards/simpleverify_reward/mean": 0.6607142686843872, "rewards/simpleverify_reward/std": 0.4737313687801361, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 662.25, "completions/mean_terminated_length": 654.5682373046875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 5.1772594752186585, "grad_norm": 0.14616534113883972, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 367584549.0, "reward": 0.6227678656578064, "reward_std": 0.19230736792087555, "rewards/simpleverify_reward/mean": 0.6227678656578064, "rewards/simpleverify_reward/std": 0.4849644899368286, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 654.7210083007812, "completions/mean_terminated_length": 647.0223999023438, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 5.186588921282799, "grad_norm": 0.13003265857696533, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 368257083.0, "reward": 0.660714328289032, "reward_std": 0.16810593008995056, "rewards/simpleverify_reward/mean": 0.6607142686843872, "rewards/simpleverify_reward/std": 0.4737313687801361, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2722.0, "completions/mean_length": 625.6328125, "completions/mean_terminated_length": 602.237060546875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 5.1959183673469385, "grad_norm": 0.15425829589366913, "learning_rate": 1e-06, "loss": 0.037, "num_tokens": 368908762.0, "reward": 0.6741071939468384, "reward_std": 0.19294676184654236, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3169.0, "completions/mean_length": 696.4520263671875, "completions/mean_terminated_length": 646.4020385742188, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 5.205247813411079, "grad_norm": 0.13435529172420502, "learning_rate": 1e-06, "loss": 0.0327, "num_tokens": 369621751.0, "reward": 0.6261160969734192, "reward_std": 0.15698716044425964, "rewards/simpleverify_reward/mean": 0.6261160969734192, "rewards/simpleverify_reward/std": 0.48410359025001526, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3778.0, "completions/mean_length": 620.8471069335938, "completions/mean_terminated_length": 605.2634887695312, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 5.214577259475218, "grad_norm": 0.14248208701610565, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 370273558.0, "reward": 0.652901828289032, "reward_std": 0.16322535276412964, "rewards/simpleverify_reward/mean": 0.6529017686843872, "rewards/simpleverify_reward/std": 0.47631320357322693, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2311.0, "completions/mean_length": 632.396240234375, "completions/mean_terminated_length": 616.8643798828125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 5.223906705539359, "grad_norm": 0.14479613304138184, "learning_rate": 1e-06, "loss": 0.0318, "num_tokens": 370925497.0, "reward": 0.6808035969734192, "reward_std": 0.19824105501174927, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.4664256274700165, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 624.552490234375, "completions/mean_terminated_length": 597.21826171875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 5.233236151603498, "grad_norm": 0.15350471436977386, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 371574816.0, "reward": 0.5758928656578064, "reward_std": 0.17048171162605286, "rewards/simpleverify_reward/mean": 0.5758928656578064, "rewards/simpleverify_reward/std": 0.49448272585868835, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 647.9910888671875, "completions/mean_terminated_length": 616.9279174804688, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 5.242565597667639, "grad_norm": 0.1378231644630432, "learning_rate": 1e-06, "loss": 0.0155, "num_tokens": 372246544.0, "reward": 0.684151828289032, "reward_std": 0.13655048608779907, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.4651124179363251, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 612.5223388671875, "completions/mean_terminated_length": 608.630126953125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 5.251895043731778, "grad_norm": 0.15396837890148163, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 372880684.0, "reward": 0.6595982313156128, "reward_std": 0.1584182232618332, "rewards/simpleverify_reward/mean": 0.6595982313156128, "rewards/simpleverify_reward/std": 0.4741089344024658, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3363.0, "completions/mean_length": 649.6585083007812, "completions/mean_terminated_length": 610.7607421875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 5.261224489795918, "grad_norm": 0.15521016716957092, "learning_rate": 1e-06, "loss": 0.0283, "num_tokens": 373548050.0, "reward": 0.6339285969734192, "reward_std": 0.1925349086523056, "rewards/simpleverify_reward/mean": 0.6339285969734192, "rewards/simpleverify_reward/std": 0.48199835419654846, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 716.2455444335938, "completions/mean_terminated_length": 681.95263671875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 5.270553935860058, "grad_norm": 0.13975757360458374, "learning_rate": 1e-06, "loss": 0.0332, "num_tokens": 374289454.0, "reward": 0.5892857313156128, "reward_std": 0.20542281866073608, "rewards/simpleverify_reward/mean": 0.5892857313156128, "rewards/simpleverify_reward/std": 0.49223825335502625, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 600.872802734375, "completions/mean_terminated_length": 593.0537109375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 5.279883381924198, "grad_norm": 0.14441782236099243, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 374918844.0, "reward": 0.6339285969734192, "reward_std": 0.1740816980600357, "rewards/simpleverify_reward/mean": 0.6339285969734192, "rewards/simpleverify_reward/std": 0.48199835419654846, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3926.0, "completions/mean_length": 614.8170166015625, "completions/mean_terminated_length": 603.1220703125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 5.289212827988338, "grad_norm": 0.131760373711586, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 375555624.0, "reward": 0.6718750596046448, "reward_std": 0.1450745314359665, "rewards/simpleverify_reward/mean": 0.671875, "rewards/simpleverify_reward/std": 0.46979284286499023, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3785.0, "completions/mean_length": 641.5402221679688, "completions/mean_terminated_length": 614.3397216796875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 5.298542274052478, "grad_norm": 0.14535890519618988, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 376215908.0, "reward": 0.6540178656578064, "reward_std": 0.17769558727741241, "rewards/simpleverify_reward/mean": 0.6540178656578064, "rewards/simpleverify_reward/std": 0.4759531021118164, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 623.1596069335938, "completions/mean_terminated_length": 611.4927368164062, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 5.307871720116618, "grad_norm": 0.1461706906557083, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 376861019.0, "reward": 0.6272321939468384, "reward_std": 0.1640583574771881, "rewards/simpleverify_reward/mean": 0.6272321343421936, "rewards/simpleverify_reward/std": 0.4838111698627472, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3899.0, "completions/mean_length": 658.872802734375, "completions/mean_terminated_length": 627.9076538085938, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 5.317201166180758, "grad_norm": 0.11526299268007278, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 377543625.0, "reward": 0.6283482313156128, "reward_std": 0.13331636786460876, "rewards/simpleverify_reward/mean": 0.6283482313156128, "rewards/simpleverify_reward/std": 0.4835159182548523, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3751.0, "completions/mean_length": 628.6171875, "completions/mean_terminated_length": 609.159423828125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 5.326530612244898, "grad_norm": 0.1368526816368103, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 378191834.0, "reward": 0.6272321939468384, "reward_std": 0.1566847860813141, "rewards/simpleverify_reward/mean": 0.6272321343421936, "rewards/simpleverify_reward/std": 0.4838111698627472, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 629.3861694335938, "completions/mean_terminated_length": 617.740234375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 5.335860058309038, "grad_norm": 0.14039236307144165, "learning_rate": 1e-06, "loss": 0.009, "num_tokens": 378837988.0, "reward": 0.6819196939468384, "reward_std": 0.155143141746521, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3818.0, "completions/mean_length": 608.4866333007812, "completions/mean_terminated_length": 596.7704467773438, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 5.345189504373177, "grad_norm": 0.14452677965164185, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 379475880.0, "reward": 0.606026828289032, "reward_std": 0.14196380972862244, "rewards/simpleverify_reward/mean": 0.6060267686843872, "rewards/simpleverify_reward/std": 0.48890191316604614, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3468.0, "completions/mean_length": 626.2756958007812, "completions/mean_terminated_length": 602.88427734375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 5.354518950437318, "grad_norm": 0.14156703650951385, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 380131599.0, "reward": 0.6808035969734192, "reward_std": 0.1487216353416443, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.4664256274700165, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 555.984375, "completions/mean_terminated_length": 536.1190185546875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 5.363848396501457, "grad_norm": 0.16111868619918823, "learning_rate": 1e-06, "loss": 0.026, "num_tokens": 380720313.0, "reward": 0.6383928656578064, "reward_std": 0.16645735502243042, "rewards/simpleverify_reward/mean": 0.6383928656578064, "rewards/simpleverify_reward/std": 0.4807341992855072, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3131.0, "completions/mean_length": 696.7767944335938, "completions/mean_terminated_length": 654.5265502929688, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 5.373177842565598, "grad_norm": 0.13886457681655884, "learning_rate": 1e-06, "loss": 0.0468, "num_tokens": 381437105.0, "reward": 0.6227678656578064, "reward_std": 0.19276131689548492, "rewards/simpleverify_reward/mean": 0.6227678656578064, "rewards/simpleverify_reward/std": 0.4849644601345062, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 609.7277221679688, "completions/mean_terminated_length": 601.9284057617188, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 5.382507288629737, "grad_norm": 0.1362166404724121, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 382079965.0, "reward": 0.6584821939468384, "reward_std": 0.14019575715065002, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3845.0, "completions/mean_length": 656.0502319335938, "completions/mean_terminated_length": 640.6244506835938, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 5.391836734693878, "grad_norm": 0.13248755037784576, "learning_rate": 1e-06, "loss": 0.0211, "num_tokens": 382754362.0, "reward": 0.6964285969734192, "reward_std": 0.1133689284324646, "rewards/simpleverify_reward/mean": 0.6964285969734192, "rewards/simpleverify_reward/std": 0.4600565731525421, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3974.0, "completions/mean_length": 643.0892944335938, "completions/mean_terminated_length": 608.0540771484375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 5.401166180758017, "grad_norm": 0.11656410247087479, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 383426194.0, "reward": 0.6484375, "reward_std": 0.12734061479568481, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2422.0, "completions/mean_length": 682.7957763671875, "completions/mean_terminated_length": 648.1634521484375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 5.410495626822158, "grad_norm": 0.1365356147289276, "learning_rate": 1e-06, "loss": 0.0337, "num_tokens": 384132019.0, "reward": 0.606026828289032, "reward_std": 0.16521881520748138, "rewards/simpleverify_reward/mean": 0.6060267686843872, "rewards/simpleverify_reward/std": 0.48890194296836853, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 625.8973388671875, "completions/mean_terminated_length": 606.4242553710938, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 5.419825072886297, "grad_norm": 0.1357126235961914, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 384778559.0, "reward": 0.6651785969734192, "reward_std": 0.14699524641036987, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219157218933105, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4078.0, "completions/mean_length": 614.1741333007812, "completions/mean_terminated_length": 586.7581787109375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 5.429154518950437, "grad_norm": 0.1162579283118248, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 385415083.0, "reward": 0.6473214626312256, "reward_std": 0.11881405115127563, "rewards/simpleverify_reward/mean": 0.6473214030265808, "rewards/simpleverify_reward/std": 0.47807058691978455, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 633.4576416015625, "completions/mean_terminated_length": 617.9305419921875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 5.438483965014577, "grad_norm": 0.12881925702095032, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 386064189.0, "reward": 0.6785714626312256, "reward_std": 0.1274600774049759, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3605.0, "completions/mean_length": 599.583740234375, "completions/mean_terminated_length": 583.9047241210938, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 5.447813411078717, "grad_norm": 0.14524903893470764, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 386684304.0, "reward": 0.6785714626312256, "reward_std": 0.14913460612297058, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2458.0, "completions/mean_length": 592.1685791015625, "completions/mean_terminated_length": 588.2536010742188, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 5.457142857142857, "grad_norm": 0.14298319816589355, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 387299527.0, "reward": 0.6551339626312256, "reward_std": 0.158342644572258, "rewards/simpleverify_reward/mean": 0.6551339030265808, "rewards/simpleverify_reward/std": 0.4755900502204895, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 646.4967041015625, "completions/mean_terminated_length": 631.028076171875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 5.466472303206997, "grad_norm": 0.11875557899475098, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 387975756.0, "reward": 0.6339285969734192, "reward_std": 0.12065695226192474, "rewards/simpleverify_reward/mean": 0.6339285969734192, "rewards/simpleverify_reward/std": 0.48199838399887085, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3750.0, "completions/mean_length": 614.0324096679688, "completions/mean_terminated_length": 594.4927368164062, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 5.475801749271137, "grad_norm": 0.15814462304115295, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 388613665.0, "reward": 0.65625, "reward_std": 0.1842675358057022, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2365.0, "completions/mean_length": 653.029052734375, "completions/mean_terminated_length": 641.4625244140625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 5.485131195335277, "grad_norm": 0.14208568632602692, "learning_rate": 1e-06, "loss": 0.0307, "num_tokens": 389297395.0, "reward": 0.5535714626312256, "reward_std": 0.15718220174312592, "rewards/simpleverify_reward/mean": 0.5535714030265808, "rewards/simpleverify_reward/std": 0.4973994791507721, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3766.0, "completions/mean_length": 686.5413208007812, "completions/mean_terminated_length": 651.9469604492188, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 5.494460641399417, "grad_norm": 0.12882056832313538, "learning_rate": 1e-06, "loss": 0.0121, "num_tokens": 390010208.0, "reward": 0.6127232313156128, "reward_std": 0.15601149201393127, "rewards/simpleverify_reward/mean": 0.6127232313156128, "rewards/simpleverify_reward/std": 0.4873998463153839, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1844.0, "completions/mean_length": 616.1674194335938, "completions/mean_terminated_length": 596.6397705078125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 5.503790087463557, "grad_norm": 0.15649403631687164, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 390647950.0, "reward": 0.6707589626312256, "reward_std": 0.17333422601222992, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 568.8092041015625, "completions/mean_terminated_length": 556.959716796875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 5.513119533527696, "grad_norm": 0.15527121722698212, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 391241323.0, "reward": 0.684151828289032, "reward_std": 0.1663813591003418, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.4651124179363251, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3360.0, "completions/mean_length": 656.5513916015625, "completions/mean_terminated_length": 625.5653076171875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 5.522448979591837, "grad_norm": 0.13979987800121307, "learning_rate": 1e-06, "loss": 0.0169, "num_tokens": 391925673.0, "reward": 0.6350446939468384, "reward_std": 0.14165930449962616, "rewards/simpleverify_reward/mean": 0.6350446343421936, "rewards/simpleverify_reward/std": 0.481686532497406, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2448.0, "completions/mean_length": 617.5558471679688, "completions/mean_terminated_length": 594.1056518554688, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 5.531778425655976, "grad_norm": 0.13854989409446716, "learning_rate": 1e-06, "loss": 0.0257, "num_tokens": 392580235.0, "reward": 0.6551339626312256, "reward_std": 0.13783179223537445, "rewards/simpleverify_reward/mean": 0.6551339030265808, "rewards/simpleverify_reward/std": 0.4755900502204895, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2658.0, "completions/mean_length": 600.1875, "completions/mean_terminated_length": 584.51123046875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 5.541107871720117, "grad_norm": 0.15290294587612152, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 393214323.0, "reward": 0.6785714626312256, "reward_std": 0.15139050781726837, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 694.7767944335938, "completions/mean_terminated_length": 671.84716796875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 5.550437317784256, "grad_norm": 0.14279164373874664, "learning_rate": 1e-06, "loss": 0.0239, "num_tokens": 393921419.0, "reward": 0.637276828289032, "reward_std": 0.17888812720775604, "rewards/simpleverify_reward/mean": 0.6372767686843872, "rewards/simpleverify_reward/std": 0.481054425239563, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3075.0, "completions/mean_length": 642.6317138671875, "completions/mean_terminated_length": 627.145751953125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 5.559766763848397, "grad_norm": 0.16115424036979675, "learning_rate": 1e-06, "loss": 0.0121, "num_tokens": 394604257.0, "reward": 0.5881696939468384, "reward_std": 0.19926205277442932, "rewards/simpleverify_reward/mean": 0.5881696343421936, "rewards/simpleverify_reward/std": 0.4924396276473999, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2656.0, "completions/mean_length": 639.6060791015625, "completions/mean_terminated_length": 616.3045043945312, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 5.569096209912536, "grad_norm": 0.144125834107399, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 395253144.0, "reward": 0.65625, "reward_std": 0.17434383928775787, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 586.1741333007812, "completions/mean_terminated_length": 582.2525024414062, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 5.578425655976677, "grad_norm": 0.1392761617898941, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 395855940.0, "reward": 0.6495535969734192, "reward_std": 0.1478532999753952, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.477376252412796, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2636.0, "completions/mean_length": 616.3069458007812, "completions/mean_terminated_length": 608.5223999023438, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 5.587755102040816, "grad_norm": 0.1378423422574997, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 396492663.0, "reward": 0.606026828289032, "reward_std": 0.14199700951576233, "rewards/simpleverify_reward/mean": 0.6060267686843872, "rewards/simpleverify_reward/std": 0.48890194296836853, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 659.6986694335938, "completions/mean_terminated_length": 620.9142456054688, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 5.597084548104956, "grad_norm": 0.15490345656871796, "learning_rate": 1e-06, "loss": 0.0443, "num_tokens": 397175345.0, "reward": 0.5859375, "reward_std": 0.18768498301506042, "rewards/simpleverify_reward/mean": 0.5859375, "rewards/simpleverify_reward/std": 0.4928344786167145, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3246.0, "completions/mean_length": 680.380615234375, "completions/mean_terminated_length": 641.82958984375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 5.606413994169096, "grad_norm": 0.14204540848731995, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 397872278.0, "reward": 0.6071428656578064, "reward_std": 0.14875483512878418, "rewards/simpleverify_reward/mean": 0.6071428656578064, "rewards/simpleverify_reward/std": 0.48865827918052673, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3786.0, "completions/mean_length": 616.6685791015625, "completions/mean_terminated_length": 608.884765625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 5.615743440233236, "grad_norm": 0.14818234741687775, "learning_rate": 1e-06, "loss": 0.02, "num_tokens": 398513253.0, "reward": 0.6662946939468384, "reward_std": 0.18096108734607697, "rewards/simpleverify_reward/mean": 0.6662946343421936, "rewards/simpleverify_reward/std": 0.47179925441741943, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 591.154052734375, "completions/mean_terminated_length": 579.379638671875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 5.625072886297376, "grad_norm": 0.14307425916194916, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 399122527.0, "reward": 0.6930803656578064, "reward_std": 0.16720370948314667, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147337555885315, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 634.4252319335938, "completions/mean_terminated_length": 622.7962036132812, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 5.634402332361516, "grad_norm": 0.16487868130207062, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 399785748.0, "reward": 0.6953125596046448, "reward_std": 0.17859388887882233, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3743.0, "completions/mean_length": 666.5870971679688, "completions/mean_terminated_length": 631.790283203125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 5.643731778425656, "grad_norm": 0.14417017996311188, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 400469450.0, "reward": 0.6774553656578064, "reward_std": 0.178367480635643, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 620.2455444335938, "completions/mean_terminated_length": 600.7407836914062, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 5.653061224489796, "grad_norm": 0.15549172461032867, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 401114678.0, "reward": 0.6439732313156128, "reward_std": 0.19113412499427795, "rewards/simpleverify_reward/mean": 0.6439732313156128, "rewards/simpleverify_reward/std": 0.47909072041511536, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2460.0, "completions/mean_length": 589.953125, "completions/mean_terminated_length": 586.0357666015625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 5.662390670553936, "grad_norm": 0.16282926499843597, "learning_rate": 1e-06, "loss": 0.0251, "num_tokens": 401730460.0, "reward": 0.6908482313156128, "reward_std": 0.1838574856519699, "rewards/simpleverify_reward/mean": 0.6908482313156128, "rewards/simpleverify_reward/std": 0.46240198612213135, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3699.0, "completions/mean_length": 611.5100708007812, "completions/mean_terminated_length": 595.8845825195312, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 5.671720116618076, "grad_norm": 0.14009907841682434, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 402365165.0, "reward": 0.6819196939468384, "reward_std": 0.1389918178319931, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 600.203125, "completions/mean_terminated_length": 568.70947265625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 5.681049562682215, "grad_norm": 0.13283240795135498, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 402997011.0, "reward": 0.6819196939468384, "reward_std": 0.1268509477376938, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4040.0, "completions/mean_length": 669.7745971679688, "completions/mean_terminated_length": 650.5477294921875, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 5.690379008746356, "grad_norm": 0.14022594690322876, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 403686513.0, "reward": 0.609375, "reward_std": 0.16991788148880005, "rewards/simpleverify_reward/mean": 0.609375, "rewards/simpleverify_reward/std": 0.48816296458244324, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2944.0, "completions/mean_length": 572.6038208007812, "completions/mean_terminated_length": 560.76708984375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 5.699708454810495, "grad_norm": 0.15568582713603973, "learning_rate": 1e-06, "loss": 0.0239, "num_tokens": 404291054.0, "reward": 0.6897321939468384, "reward_std": 0.1554480642080307, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 616.2154541015625, "completions/mean_terminated_length": 608.4306640625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 5.709037900874636, "grad_norm": 0.14864179491996765, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 404938583.0, "reward": 0.6551339626312256, "reward_std": 0.15931974351406097, "rewards/simpleverify_reward/mean": 0.6551339030265808, "rewards/simpleverify_reward/std": 0.4755900502204895, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4019.0, "completions/mean_length": 656.2377319335938, "completions/mean_terminated_length": 633.04833984375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 5.718367346938775, "grad_norm": 0.148314967751503, "learning_rate": 1e-06, "loss": 0.0274, "num_tokens": 405626220.0, "reward": 0.6785714626312256, "reward_std": 0.17844374477863312, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3789.0, "completions/mean_length": 626.9442138671875, "completions/mean_terminated_length": 603.5573120117188, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 5.727696793002916, "grad_norm": 0.1431846022605896, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 406277618.0, "reward": 0.6339285969734192, "reward_std": 0.13876360654830933, "rewards/simpleverify_reward/mean": 0.6339285969734192, "rewards/simpleverify_reward/std": 0.48199835419654846, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2844.0, "completions/mean_length": 609.0480346679688, "completions/mean_terminated_length": 601.2471923828125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 5.737026239067055, "grad_norm": 0.13688473403453827, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 406917061.0, "reward": 0.6473214626312256, "reward_std": 0.13038858771324158, "rewards/simpleverify_reward/mean": 0.6473214030265808, "rewards/simpleverify_reward/std": 0.47807058691978455, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2755.0, "completions/mean_length": 621.59375, "completions/mean_terminated_length": 594.2362670898438, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 5.746355685131196, "grad_norm": 0.14667534828186035, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 407562505.0, "reward": 0.6160714626312256, "reward_std": 0.16123977303504944, "rewards/simpleverify_reward/mean": 0.6160714030265808, "rewards/simpleverify_reward/std": 0.486612468957901, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2772.0, "completions/mean_length": 622.560302734375, "completions/mean_terminated_length": 599.143798828125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 5.755685131195335, "grad_norm": 0.14410516619682312, "learning_rate": 1e-06, "loss": 0.0386, "num_tokens": 408213151.0, "reward": 0.6819196939468384, "reward_std": 0.16645807027816772, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 597.8125, "completions/mean_terminated_length": 586.0604858398438, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 5.765014577259475, "grad_norm": 0.1626434028148651, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 408835127.0, "reward": 0.6729910969734192, "reward_std": 0.1674005389213562, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.46938255429267883, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2691.0, "completions/mean_length": 675.7433471679688, "completions/mean_terminated_length": 660.4058837890625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 5.774344023323615, "grad_norm": 0.11800877749919891, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 409524881.0, "reward": 0.6651785969734192, "reward_std": 0.1371459811925888, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219157218933105, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3787.0, "completions/mean_length": 624.5089721679688, "completions/mean_terminated_length": 608.9417114257812, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 5.783673469387755, "grad_norm": 0.13916060328483582, "learning_rate": 1e-06, "loss": 0.0332, "num_tokens": 410173945.0, "reward": 0.6383928656578064, "reward_std": 0.13752686977386475, "rewards/simpleverify_reward/mean": 0.6383928656578064, "rewards/simpleverify_reward/std": 0.4807341992855072, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3493.0, "completions/mean_length": 573.2210083007812, "completions/mean_terminated_length": 561.3863525390625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 5.793002915451895, "grad_norm": 0.14485199749469757, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 410784431.0, "reward": 0.6941964626312256, "reward_std": 0.14345942437648773, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.4610042870044708, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 641.9520263671875, "completions/mean_terminated_length": 602.96728515625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 5.802332361516035, "grad_norm": 0.14726622402668, "learning_rate": 1e-06, "loss": 0.037, "num_tokens": 411446700.0, "reward": 0.6484375, "reward_std": 0.16758599877357483, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3107.0, "completions/mean_length": 633.7611694335938, "completions/mean_terminated_length": 614.3322143554688, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 5.811661807580175, "grad_norm": 0.13373687863349915, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 412098758.0, "reward": 0.6707589626312256, "reward_std": 0.15097934007644653, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3697.0, "completions/mean_length": 673.646240234375, "completions/mean_terminated_length": 654.4411010742188, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 5.820991253644315, "grad_norm": 0.1304638385772705, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 412802745.0, "reward": 0.6238839626312256, "reward_std": 0.14027062058448792, "rewards/simpleverify_reward/mean": 0.6238839030265808, "rewards/simpleverify_reward/std": 0.48468026518821716, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3573.0, "completions/mean_length": 674.864990234375, "completions/mean_terminated_length": 647.9269409179688, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 5.830320699708455, "grad_norm": 0.13859499990940094, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 413509200.0, "reward": 0.6127232313156128, "reward_std": 0.1599259227514267, "rewards/simpleverify_reward/mean": 0.6127232313156128, "rewards/simpleverify_reward/std": 0.4873998463153839, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 599.739990234375, "completions/mean_terminated_length": 587.9944458007812, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 5.839650145772595, "grad_norm": 0.1537138819694519, "learning_rate": 1e-06, "loss": 0.0168, "num_tokens": 414126199.0, "reward": 0.7209821939468384, "reward_std": 0.17389875650405884, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3811.0, "completions/mean_length": 569.380615234375, "completions/mean_terminated_length": 561.4910278320312, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 5.848979591836734, "grad_norm": 0.148297980427742, "learning_rate": 1e-06, "loss": 0.023, "num_tokens": 414716340.0, "reward": 0.6495535969734192, "reward_std": 0.12084352970123291, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.477376252412796, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 610.739990234375, "completions/mean_terminated_length": 591.1818237304688, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 5.858309037900875, "grad_norm": 0.1489398181438446, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 415358523.0, "reward": 0.6584821939468384, "reward_std": 0.1561630219221115, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2903.0, "completions/mean_length": 636.7645263671875, "completions/mean_terminated_length": 617.3524169921875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 5.867638483965014, "grad_norm": 0.1494756042957306, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 416009368.0, "reward": 0.6428571939468384, "reward_std": 0.16533716022968292, "rewards/simpleverify_reward/mean": 0.6428571343421936, "rewards/simpleverify_reward/std": 0.4794250428676605, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3169.0, "completions/mean_length": 636.4252319335938, "completions/mean_terminated_length": 617.01123046875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 5.876967930029155, "grad_norm": 0.151694193482399, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 416677725.0, "reward": 0.6640625, "reward_std": 0.17066676914691925, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3900.0, "completions/mean_length": 635.6506958007812, "completions/mean_terminated_length": 616.2323608398438, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 5.886297376093294, "grad_norm": 0.13740867376327515, "learning_rate": 1e-06, "loss": 0.0304, "num_tokens": 417339300.0, "reward": 0.6618303656578064, "reward_std": 0.15608564019203186, "rewards/simpleverify_reward/mean": 0.6618303656578064, "rewards/simpleverify_reward/std": 0.4733508229255676, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2581.0, "completions/mean_length": 580.9788208007812, "completions/mean_terminated_length": 569.1702270507812, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 5.895626822157435, "grad_norm": 0.14175787568092346, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 417947545.0, "reward": 0.652901828289032, "reward_std": 0.13655048608779907, "rewards/simpleverify_reward/mean": 0.6529017686843872, "rewards/simpleverify_reward/std": 0.47631317377090454, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2157.0, "completions/mean_length": 630.7957763671875, "completions/mean_terminated_length": 619.154541015625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 5.904956268221574, "grad_norm": 0.15464933216571808, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 418609690.0, "reward": 0.6283482313156128, "reward_std": 0.16574057936668396, "rewards/simpleverify_reward/mean": 0.6283482313156128, "rewards/simpleverify_reward/std": 0.4835159182548523, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 590.9642944335938, "completions/mean_terminated_length": 579.1892700195312, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 5.914285714285715, "grad_norm": 0.13515737652778625, "learning_rate": 1e-06, "loss": 0.0169, "num_tokens": 419220202.0, "reward": 0.6897321939468384, "reward_std": 0.129107266664505, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.462861567735672, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 592.5435791015625, "completions/mean_terminated_length": 580.7738037109375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 5.923615160349854, "grad_norm": 0.15322956442832947, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 419843361.0, "reward": 0.6707589626312256, "reward_std": 0.1662725955247879, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3756.0, "completions/mean_length": 651.6373291015625, "completions/mean_terminated_length": 636.1917114257812, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 5.932944606413994, "grad_norm": 0.16265155375003815, "learning_rate": 1e-06, "loss": 0.0315, "num_tokens": 420524628.0, "reward": 0.6495535969734192, "reward_std": 0.1993369162082672, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.47737622261047363, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3441.0, "completions/mean_length": 639.2533569335938, "completions/mean_terminated_length": 615.949462890625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 5.942274052478134, "grad_norm": 0.14435029029846191, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 421184519.0, "reward": 0.6462053656578064, "reward_std": 0.15393991768360138, "rewards/simpleverify_reward/mean": 0.6462053656578064, "rewards/simpleverify_reward/std": 0.478413462638855, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 596.6998291015625, "completions/mean_terminated_length": 592.7899169921875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 5.9516034985422746, "grad_norm": 0.14315444231033325, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 421814890.0, "reward": 0.6227678656578064, "reward_std": 0.14447972178459167, "rewards/simpleverify_reward/mean": 0.6227678656578064, "rewards/simpleverify_reward/std": 0.4849644899368286, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3627.0, "completions/mean_length": 666.6819458007812, "completions/mean_terminated_length": 647.437744140625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 5.960932944606414, "grad_norm": 0.14819471538066864, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 422507973.0, "reward": 0.6183035969734192, "reward_std": 0.164321631193161, "rewards/simpleverify_reward/mean": 0.6183035969734192, "rewards/simpleverify_reward/std": 0.4860740303993225, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2137.0, "completions/max_terminated_length": 2137.0, "completions/mean_length": 576.0513916015625, "completions/mean_terminated_length": 576.0513916015625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 5.970262390670554, "grad_norm": 0.15137754380702972, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 423115331.0, "reward": 0.645089328289032, "reward_std": 0.15634456276893616, "rewards/simpleverify_reward/mean": 0.6450892686843872, "rewards/simpleverify_reward/std": 0.4787535071372986, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3639.0, "completions/mean_length": 569.3694458007812, "completions/mean_terminated_length": 561.4798583984375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 5.979591836734694, "grad_norm": 0.13215266168117523, "learning_rate": 1e-06, "loss": 0.0242, "num_tokens": 423705726.0, "reward": 0.6863839626312256, "reward_std": 0.12692762911319733, "rewards/simpleverify_reward/mean": 0.6863839030265808, "rewards/simpleverify_reward/std": 0.46422144770622253, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3282.0, "completions/mean_length": 661.599365234375, "completions/mean_terminated_length": 614.978515625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 5.988921282798834, "grad_norm": 0.17408061027526855, "learning_rate": 1e-06, "loss": 0.0332, "num_tokens": 424398487.0, "reward": 0.606026828289032, "reward_std": 0.19186411798000336, "rewards/simpleverify_reward/mean": 0.6060267686843872, "rewards/simpleverify_reward/std": 0.48890191316604614, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0028409090909090606, "completions/max_length": 4096.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 623.7244262695312, "completions/mean_terminated_length": 613.8319091796875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 5.998250728862974, "grad_norm": 0.12240371853113174, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 425010853.0, "reward": 0.6662946939468384, "reward_std": 0.12290439009666443, "rewards/simpleverify_reward/mean": 0.6662946343421936, "rewards/simpleverify_reward/std": 0.47179925441741943, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4002.0, "completions/mean_length": 651.2980346679688, "completions/mean_terminated_length": 624.1743774414062, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 6.0093294460641395, "grad_norm": 0.14365944266319275, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 425699360.0, "reward": 0.6261160969734192, "reward_std": 0.14538197219371796, "rewards/simpleverify_reward/mean": 0.6261160969734192, "rewards/simpleverify_reward/std": 0.48410359025001526, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3491.0, "completions/mean_length": 570.0223388671875, "completions/mean_terminated_length": 550.2357177734375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 6.01865889212828, "grad_norm": 0.15465058386325836, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 426287716.0, "reward": 0.7209821939468384, "reward_std": 0.15165013074874878, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3827.0, "completions/mean_length": 636.5848388671875, "completions/mean_terminated_length": 621.07177734375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 6.0279883381924195, "grad_norm": 0.1801861673593521, "learning_rate": 1e-06, "loss": 0.024, "num_tokens": 426949624.0, "reward": 0.660714328289032, "reward_std": 0.20166556537151337, "rewards/simpleverify_reward/mean": 0.6607142686843872, "rewards/simpleverify_reward/std": 0.4737313687801361, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 633.833740234375, "completions/mean_terminated_length": 629.96533203125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 6.03731778425656, "grad_norm": 0.11371079832315445, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 427618371.0, "reward": 0.5848214626312256, "reward_std": 0.1065223291516304, "rewards/simpleverify_reward/mean": 0.5848214030265808, "rewards/simpleverify_reward/std": 0.49302801489830017, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4037.0, "completions/mean_length": 700.8895263671875, "completions/mean_terminated_length": 674.1563720703125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 6.0466472303206995, "grad_norm": 0.14044605195522308, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 428330704.0, "reward": 0.6584821939468384, "reward_std": 0.15995845198631287, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 607.3761596679688, "completions/mean_terminated_length": 595.65625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 6.05597667638484, "grad_norm": 0.14227697253227234, "learning_rate": 1e-06, "loss": 0.0246, "num_tokens": 428962449.0, "reward": 0.676339328289032, "reward_std": 0.1502639651298523, "rewards/simpleverify_reward/mean": 0.6763392686843872, "rewards/simpleverify_reward/std": 0.4681335687637329, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2174.0, "completions/mean_length": 641.0491333007812, "completions/mean_terminated_length": 621.6610717773438, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 6.0653061224489795, "grad_norm": 0.14231666922569275, "learning_rate": 1e-06, "loss": 0.0169, "num_tokens": 429624549.0, "reward": 0.6618303656578064, "reward_std": 0.14305077493190765, "rewards/simpleverify_reward/mean": 0.6618303656578064, "rewards/simpleverify_reward/std": 0.4733508229255676, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2124.0, "completions/max_terminated_length": 2124.0, "completions/mean_length": 565.125, "completions/mean_terminated_length": 565.125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 6.07463556851312, "grad_norm": 0.1557060033082962, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 430222237.0, "reward": 0.6495535969734192, "reward_std": 0.15353623032569885, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.477376252412796, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 577.4486694335938, "completions/mean_terminated_length": 565.6282348632812, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 6.0839650145772595, "grad_norm": 0.13942131400108337, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 430830623.0, "reward": 0.6696428656578064, "reward_std": 0.12171045690774918, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 574.6417846679688, "completions/mean_terminated_length": 562.8118896484375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 6.093294460641399, "grad_norm": 0.1490231603384018, "learning_rate": 1e-06, "loss": -0.0039, "num_tokens": 431433678.0, "reward": 0.668526828289032, "reward_std": 0.13433806598186493, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3200.0, "completions/mean_length": 609.075927734375, "completions/mean_terminated_length": 597.3616943359375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 6.1026239067055394, "grad_norm": 0.14733345806598663, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 432070202.0, "reward": 0.6640625, "reward_std": 0.16679325699806213, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3637.0, "completions/mean_length": 654.4096069335938, "completions/mean_terminated_length": 635.0965576171875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 6.111953352769679, "grad_norm": 0.13968029618263245, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 432753769.0, "reward": 0.6283482313156128, "reward_std": 0.17821411788463593, "rewards/simpleverify_reward/mean": 0.6283482313156128, "rewards/simpleverify_reward/std": 0.4835159480571747, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3278.0, "completions/mean_length": 586.0881958007812, "completions/mean_terminated_length": 566.3917236328125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 6.121282798833819, "grad_norm": 0.15436553955078125, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 433361048.0, "reward": 0.6484375, "reward_std": 0.1607826054096222, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2491.0, "completions/mean_length": 580.1105346679688, "completions/mean_terminated_length": 564.3441772460938, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 6.130612244897959, "grad_norm": 0.14804790914058685, "learning_rate": 1e-06, "loss": 0.03, "num_tokens": 433958059.0, "reward": 0.7220982313156128, "reward_std": 0.13842660188674927, "rewards/simpleverify_reward/mean": 0.7220982313156128, "rewards/simpleverify_reward/std": 0.44821491837501526, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3989.0, "completions/mean_length": 635.1038208007812, "completions/mean_terminated_length": 619.5841064453125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 6.139941690962099, "grad_norm": 0.1388985961675644, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 434612112.0, "reward": 0.6852678656578064, "reward_std": 0.1552600860595703, "rewards/simpleverify_reward/mean": 0.6852678656578064, "rewards/simpleverify_reward/std": 0.46466848254203796, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3298.0, "completions/mean_length": 639.9296875, "completions/mean_terminated_length": 608.7939453125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 6.149271137026239, "grad_norm": 0.15533199906349182, "learning_rate": 1e-06, "loss": 0.0283, "num_tokens": 435279297.0, "reward": 0.6540178656578064, "reward_std": 0.16852892935276031, "rewards/simpleverify_reward/mean": 0.6540178656578064, "rewards/simpleverify_reward/std": 0.475953072309494, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2111.0, "completions/mean_length": 579.9308471679688, "completions/mean_terminated_length": 572.0648803710938, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 6.158600583090379, "grad_norm": 0.14025883376598358, "learning_rate": 1e-06, "loss": 0.0149, "num_tokens": 435883987.0, "reward": 0.731026828289032, "reward_std": 0.14165930449962616, "rewards/simpleverify_reward/mean": 0.7310267686843872, "rewards/simpleverify_reward/std": 0.44367367029190063, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3335.0, "completions/mean_length": 606.4006958007812, "completions/mean_terminated_length": 598.5939331054688, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 6.167930029154519, "grad_norm": 0.13194501399993896, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 436513866.0, "reward": 0.6573660969734192, "reward_std": 0.1525152325630188, "rewards/simpleverify_reward/mean": 0.6573660969734192, "rewards/simpleverify_reward/std": 0.47485533356666565, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3320.0, "completions/mean_length": 674.2935791015625, "completions/mean_terminated_length": 655.092041015625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 6.1772594752186585, "grad_norm": 0.13218291103839874, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 437220313.0, "reward": 0.590401828289032, "reward_std": 0.1504140943288803, "rewards/simpleverify_reward/mean": 0.5904017686843872, "rewards/simpleverify_reward/std": 0.49203425645828247, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2455.0, "completions/mean_length": 603.2388916015625, "completions/mean_terminated_length": 595.425048828125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 6.186588921282799, "grad_norm": 0.1445014327764511, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 437843743.0, "reward": 0.7053571939468384, "reward_std": 0.14955787360668182, "rewards/simpleverify_reward/mean": 0.7053571343421936, "rewards/simpleverify_reward/std": 0.45613667368888855, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2775.0, "completions/mean_length": 583.453125, "completions/mean_terminated_length": 571.6528930664062, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 6.1959183673469385, "grad_norm": 0.1493803709745407, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 438459933.0, "reward": 0.7187500596046448, "reward_std": 0.15349414944648743, "rewards/simpleverify_reward/mean": 0.71875, "rewards/simpleverify_reward/std": 0.4498603343963623, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2691.0, "completions/mean_length": 598.0971069335938, "completions/mean_terminated_length": 590.2717895507812, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 6.205247813411079, "grad_norm": 0.12528078258037567, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 439076956.0, "reward": 0.6383928656578064, "reward_std": 0.12752537429332733, "rewards/simpleverify_reward/mean": 0.6383928656578064, "rewards/simpleverify_reward/std": 0.4807341992855072, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2113.0, "completions/max_terminated_length": 2113.0, "completions/mean_length": 614.6529541015625, "completions/mean_terminated_length": 614.6529541015625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 6.214577259475218, "grad_norm": 0.1415206491947174, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 439726565.0, "reward": 0.5535714626312256, "reward_std": 0.12054044008255005, "rewards/simpleverify_reward/mean": 0.5535714030265808, "rewards/simpleverify_reward/std": 0.4973994791507721, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 577.1506958007812, "completions/mean_terminated_length": 561.37109375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 6.223906705539359, "grad_norm": 0.15647977590560913, "learning_rate": 1e-06, "loss": 0.017, "num_tokens": 440322068.0, "reward": 0.6819196939468384, "reward_std": 0.15277989208698273, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3003.0, "completions/max_terminated_length": 3003.0, "completions/mean_length": 659.654052734375, "completions/mean_terminated_length": 659.654052734375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 6.233236151603498, "grad_norm": 0.13468487560749054, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 440995238.0, "reward": 0.6741071939468384, "reward_std": 0.14751699566841125, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3752.0, "completions/mean_length": 679.693115234375, "completions/mean_terminated_length": 660.5219116210938, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 6.242565597667639, "grad_norm": 0.15833649039268494, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 441700027.0, "reward": 0.6517857313156128, "reward_std": 0.18157020211219788, "rewards/simpleverify_reward/mean": 0.6517857313156128, "rewards/simpleverify_reward/std": 0.47667041420936584, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2794.0, "completions/mean_length": 618.8303833007812, "completions/mean_terminated_length": 607.1489868164062, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 6.251895043731778, "grad_norm": 0.14377883076667786, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 442345851.0, "reward": 0.6037946939468384, "reward_std": 0.13895723223686218, "rewards/simpleverify_reward/mean": 0.6037946343421936, "rewards/simpleverify_reward/std": 0.48938122391700745, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3896.0, "completions/mean_length": 614.5011596679688, "completions/mean_terminated_length": 579.1758422851562, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 6.261224489795918, "grad_norm": 0.13852624595165253, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 442986676.0, "reward": 0.684151828289032, "reward_std": 0.1306488960981369, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.4651124179363251, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 589.7199096679688, "completions/mean_terminated_length": 585.80224609375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 6.270553935860058, "grad_norm": 0.15899699926376343, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 443604105.0, "reward": 0.6975446939468384, "reward_std": 0.1709684431552887, "rewards/simpleverify_reward/mean": 0.6975446343421936, "rewards/simpleverify_reward/std": 0.45957788825035095, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 631.8772583007812, "completions/mean_terminated_length": 612.437744140625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 6.279883381924198, "grad_norm": 0.14165626466274261, "learning_rate": 1e-06, "loss": 0.0182, "num_tokens": 444264571.0, "reward": 0.6752232313156128, "reward_std": 0.14969801902770996, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 602.8314819335938, "completions/mean_terminated_length": 591.0963134765625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 6.289212827988338, "grad_norm": 0.15478475391864777, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 444904436.0, "reward": 0.6395089626312256, "reward_std": 0.16322647035121918, "rewards/simpleverify_reward/mean": 0.6395089030265808, "rewards/simpleverify_reward/std": 0.4804111421108246, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 609.0390625, "completions/mean_terminated_length": 605.1430053710938, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 6.298542274052478, "grad_norm": 0.12949661910533905, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 445536975.0, "reward": 0.6897321939468384, "reward_std": 0.14206933975219727, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 663.2421875, "completions/mean_terminated_length": 628.4114990234375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 6.307871720116618, "grad_norm": 0.13532918691635132, "learning_rate": 1e-06, "loss": 0.0278, "num_tokens": 446206984.0, "reward": 0.6819196939468384, "reward_std": 0.1642419993877411, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2728.0, "completions/mean_length": 671.6160888671875, "completions/mean_terminated_length": 648.5303344726562, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 6.317201166180758, "grad_norm": 0.14881631731987, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 446895856.0, "reward": 0.6595982313156128, "reward_std": 0.15947198867797852, "rewards/simpleverify_reward/mean": 0.6595982313156128, "rewards/simpleverify_reward/std": 0.4741089344024658, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3248.0, "completions/mean_length": 623.6830444335938, "completions/mean_terminated_length": 600.274169921875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 6.326530612244898, "grad_norm": 0.14084142446517944, "learning_rate": 1e-06, "loss": 0.0192, "num_tokens": 447546036.0, "reward": 0.6629464626312256, "reward_std": 0.13868804275989532, "rewards/simpleverify_reward/mean": 0.6629464030265808, "rewards/simpleverify_reward/std": 0.47296738624572754, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 598.8460083007812, "completions/mean_terminated_length": 587.0974731445312, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 6.335860058309038, "grad_norm": 0.1305694580078125, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 448185650.0, "reward": 0.6484375, "reward_std": 0.12069197744131088, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2509.0, "completions/mean_length": 634.7076416015625, "completions/mean_terminated_length": 599.5873413085938, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 6.345189504373177, "grad_norm": 0.15119941532611847, "learning_rate": 1e-06, "loss": 0.0361, "num_tokens": 448842844.0, "reward": 0.6953125596046448, "reward_std": 0.15556389093399048, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 639.0870971679688, "completions/mean_terminated_length": 631.3534545898438, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 6.354518950437318, "grad_norm": 0.15155895054340363, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 449501330.0, "reward": 0.6618303656578064, "reward_std": 0.1652202159166336, "rewards/simpleverify_reward/mean": 0.6618303656578064, "rewards/simpleverify_reward/std": 0.4733508229255676, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3415.0, "completions/mean_length": 677.671875, "completions/mean_terminated_length": 658.4893798828125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 6.363848396501457, "grad_norm": 0.15595096349716187, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 450199324.0, "reward": 0.5881696939468384, "reward_std": 0.17386597394943237, "rewards/simpleverify_reward/mean": 0.5881696343421936, "rewards/simpleverify_reward/std": 0.4924395978450775, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 628.1685791015625, "completions/mean_terminated_length": 612.6177368164062, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 6.373177842565598, "grad_norm": 0.14043304324150085, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 450857507.0, "reward": 0.6417410969734192, "reward_std": 0.1529282182455063, "rewards/simpleverify_reward/mean": 0.6417410969734192, "rewards/simpleverify_reward/std": 0.47975659370422363, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2419.0, "completions/mean_length": 591.1652221679688, "completions/mean_terminated_length": 587.2491455078125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 6.382507288629737, "grad_norm": 0.15665680170059204, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 451479791.0, "reward": 0.7064732313156128, "reward_std": 0.16555652022361755, "rewards/simpleverify_reward/mean": 0.7064732313156128, "rewards/simpleverify_reward/std": 0.4556320011615753, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3471.0, "completions/mean_length": 664.6160888671875, "completions/mean_terminated_length": 653.0885009765625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 6.391836734693878, "grad_norm": 0.1584169715642929, "learning_rate": 1e-06, "loss": 0.0188, "num_tokens": 452165607.0, "reward": 0.613839328289032, "reward_std": 0.18599504232406616, "rewards/simpleverify_reward/mean": 0.6138392686843872, "rewards/simpleverify_reward/std": 0.48714008927345276, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3176.0, "completions/mean_length": 587.818115234375, "completions/mean_terminated_length": 564.1674194335938, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 6.401166180758017, "grad_norm": 0.14397628605365753, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 452783940.0, "reward": 0.7555803656578064, "reward_std": 0.1462068110704422, "rewards/simpleverify_reward/mean": 0.7555803656578064, "rewards/simpleverify_reward/std": 0.42998260259628296, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 606.2689819335938, "completions/mean_terminated_length": 594.5453491210938, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 6.410495626822158, "grad_norm": 0.15355773270130157, "learning_rate": 1e-06, "loss": 0.0265, "num_tokens": 453414661.0, "reward": 0.6462053656578064, "reward_std": 0.17757612466812134, "rewards/simpleverify_reward/mean": 0.6462053656578064, "rewards/simpleverify_reward/std": 0.478413462638855, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2410.0, "completions/mean_length": 665.404052734375, "completions/mean_terminated_length": 638.3914794921875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 6.419825072886297, "grad_norm": 0.14034879207611084, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 454095295.0, "reward": 0.6640625, "reward_std": 0.17622952163219452, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3150.0, "completions/mean_length": 656.6730346679688, "completions/mean_terminated_length": 633.4865112304688, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 6.429154518950437, "grad_norm": 0.136312335729599, "learning_rate": 1e-06, "loss": 0.0262, "num_tokens": 454771226.0, "reward": 0.6584821939468384, "reward_std": 0.1493610143661499, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3519.0, "completions/mean_length": 649.5045166015625, "completions/mean_terminated_length": 637.9260864257812, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 6.438483965014577, "grad_norm": 0.12762826681137085, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 455444046.0, "reward": 0.6350446939468384, "reward_std": 0.12884946167469025, "rewards/simpleverify_reward/mean": 0.6350446343421936, "rewards/simpleverify_reward/std": 0.481686532497406, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2256.0, "completions/mean_length": 607.2455444335938, "completions/mean_terminated_length": 591.6009521484375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 6.447813411078717, "grad_norm": 0.15944871306419373, "learning_rate": 1e-06, "loss": 0.0155, "num_tokens": 456072970.0, "reward": 0.6897321939468384, "reward_std": 0.16777077317237854, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3177.0, "completions/mean_length": 646.6328125, "completions/mean_terminated_length": 638.9161376953125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 6.457142857142857, "grad_norm": 0.15672117471694946, "learning_rate": 1e-06, "loss": 0.0189, "num_tokens": 456740825.0, "reward": 0.6774553656578064, "reward_std": 0.1849508136510849, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3129.0, "completions/mean_length": 590.5714721679688, "completions/mean_terminated_length": 582.7293090820312, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 6.466472303206997, "grad_norm": 0.14095713198184967, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 457360265.0, "reward": 0.6908482313156128, "reward_std": 0.13098520040512085, "rewards/simpleverify_reward/mean": 0.6908482313156128, "rewards/simpleverify_reward/std": 0.46240198612213135, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3842.0, "completions/mean_length": 634.4241333007812, "completions/mean_terminated_length": 607.1676025390625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 6.475801749271137, "grad_norm": 0.14606495201587677, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 458011101.0, "reward": 0.6964285969734192, "reward_std": 0.1579635739326477, "rewards/simpleverify_reward/mean": 0.6964285969734192, "rewards/simpleverify_reward/std": 0.4600565731525421, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 567.2410888671875, "completions/mean_terminated_length": 547.4388427734375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 6.485131195335277, "grad_norm": 0.15165451169013977, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 458610917.0, "reward": 0.668526828289032, "reward_std": 0.15138868987560272, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2936.0, "completions/mean_length": 599.6908569335938, "completions/mean_terminated_length": 591.869140625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 6.494460641399417, "grad_norm": 0.15630675852298737, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 459235600.0, "reward": 0.6584821939468384, "reward_std": 0.15273599326610565, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2698.0, "completions/mean_length": 615.734375, "completions/mean_terminated_length": 604.0426025390625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 6.503790087463557, "grad_norm": 0.16749347746372223, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 459880890.0, "reward": 0.6339285969734192, "reward_std": 0.17927998304367065, "rewards/simpleverify_reward/mean": 0.6339285969734192, "rewards/simpleverify_reward/std": 0.48199835419654846, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2755.0, "completions/mean_length": 643.1730346679688, "completions/mean_terminated_length": 635.4485473632812, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 6.513119533527696, "grad_norm": 0.15098921954631805, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 460553845.0, "reward": 0.6696428656578064, "reward_std": 0.1554890275001526, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3958.0, "completions/mean_length": 656.5011596679688, "completions/mean_terminated_length": 629.41845703125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 6.522448979591837, "grad_norm": 0.1620839685201645, "learning_rate": 1e-06, "loss": 0.0568, "num_tokens": 461241822.0, "reward": 0.6785714626312256, "reward_std": 0.16897539794445038, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2869.0, "completions/mean_length": 572.0949096679688, "completions/mean_terminated_length": 564.21142578125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 6.531778425655976, "grad_norm": 0.14321674406528473, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 461844147.0, "reward": 0.738839328289032, "reward_std": 0.13391439616680145, "rewards/simpleverify_reward/mean": 0.7388392686843872, "rewards/simpleverify_reward/std": 0.439512699842453, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1866.0, "completions/mean_length": 575.4799194335938, "completions/mean_terminated_length": 567.60400390625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 6.541107871720117, "grad_norm": 0.13240191340446472, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 462439961.0, "reward": 0.6886160969734192, "reward_std": 0.13211314380168915, "rewards/simpleverify_reward/mean": 0.6886160969734192, "rewards/simpleverify_reward/std": 0.46331802010536194, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2600.0, "completions/mean_length": 626.7455444335938, "completions/mean_terminated_length": 615.0906982421875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 6.550437317784256, "grad_norm": 0.13784655928611755, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 463086661.0, "reward": 0.6741071939468384, "reward_std": 0.12576690316200256, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 587.890625, "completions/mean_terminated_length": 583.970947265625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 6.559766763848397, "grad_norm": 0.1550212949514389, "learning_rate": 1e-06, "loss": 0.017, "num_tokens": 463697715.0, "reward": 0.668526828289032, "reward_std": 0.14304685592651367, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3689.0, "completions/mean_length": 682.9006958007812, "completions/mean_terminated_length": 675.2650756835938, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 6.569096209912536, "grad_norm": 0.15007971227169037, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 464385938.0, "reward": 0.6517857313156128, "reward_std": 0.14199630916118622, "rewards/simpleverify_reward/mean": 0.6517857313156128, "rewards/simpleverify_reward/std": 0.47667041420936584, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 577.2980346679688, "completions/mean_terminated_length": 573.366455078125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 6.578425655976677, "grad_norm": 0.17092856764793396, "learning_rate": 1e-06, "loss": 0.0318, "num_tokens": 464988749.0, "reward": 0.7176339626312256, "reward_std": 0.15154317021369934, "rewards/simpleverify_reward/mean": 0.7176339030265808, "rewards/simpleverify_reward/std": 0.4504019320011139, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3427.0, "completions/mean_length": 655.3995971679688, "completions/mean_terminated_length": 632.2045288085938, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 6.587755102040816, "grad_norm": 0.15398958325386047, "learning_rate": 1e-06, "loss": 0.0328, "num_tokens": 465657147.0, "reward": 0.6953125596046448, "reward_std": 0.1628890484571457, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 646.1920166015625, "completions/mean_terminated_length": 626.8328247070312, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 6.597084548104956, "grad_norm": 0.14689505100250244, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 466329191.0, "reward": 0.6350446939468384, "reward_std": 0.16791978478431702, "rewards/simpleverify_reward/mean": 0.6350446343421936, "rewards/simpleverify_reward/std": 0.4816865026950836, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 607.4810791015625, "completions/mean_terminated_length": 591.8374633789062, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 6.606413994169096, "grad_norm": 0.14494812488555908, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 466964886.0, "reward": 0.6696428656578064, "reward_std": 0.14980609714984894, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 571.3638916015625, "completions/mean_terminated_length": 555.558349609375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 6.615743440233236, "grad_norm": 0.18044592440128326, "learning_rate": 1e-06, "loss": 0.0388, "num_tokens": 467566956.0, "reward": 0.6930803656578064, "reward_std": 0.1832936555147171, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147334575653076, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3612.0, "completions/mean_length": 620.8449096679688, "completions/mean_terminated_length": 601.3434448242188, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 6.625072886297376, "grad_norm": 0.1749872863292694, "learning_rate": 1e-06, "loss": 0.0353, "num_tokens": 468212529.0, "reward": 0.6573660969734192, "reward_std": 0.193587988615036, "rewards/simpleverify_reward/mean": 0.6573660969734192, "rewards/simpleverify_reward/std": 0.47485533356666565, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3279.0, "completions/mean_length": 639.375, "completions/mean_terminated_length": 608.2342529296875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 6.634402332361516, "grad_norm": 0.14375221729278564, "learning_rate": 1e-06, "loss": 0.0413, "num_tokens": 468885201.0, "reward": 0.676339328289032, "reward_std": 0.16281278431415558, "rewards/simpleverify_reward/mean": 0.6763392686843872, "rewards/simpleverify_reward/std": 0.4681335985660553, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 635.122802734375, "completions/mean_terminated_length": 615.7014770507812, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 6.643731778425656, "grad_norm": 0.1394072026014328, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 469538127.0, "reward": 0.6573660969734192, "reward_std": 0.1371157020330429, "rewards/simpleverify_reward/mean": 0.6573660969734192, "rewards/simpleverify_reward/std": 0.47485533356666565, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3320.0, "completions/mean_length": 603.8392944335938, "completions/mean_terminated_length": 596.02685546875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 6.653061224489796, "grad_norm": 0.15454906225204468, "learning_rate": 1e-06, "loss": 0.0211, "num_tokens": 470175535.0, "reward": 0.5970982313156128, "reward_std": 0.17547248303890228, "rewards/simpleverify_reward/mean": 0.5970982313156128, "rewards/simpleverify_reward/std": 0.4907552897930145, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 589.8471069335938, "completions/mean_terminated_length": 578.0682983398438, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 6.662390670553936, "grad_norm": 0.1273425966501236, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 470795054.0, "reward": 0.6774553656578064, "reward_std": 0.09743304550647736, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3308.0, "completions/mean_length": 622.0513916015625, "completions/mean_terminated_length": 610.3807373046875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 6.671720116618076, "grad_norm": 0.14251548051834106, "learning_rate": 1e-06, "loss": 0.0201, "num_tokens": 471451524.0, "reward": 0.637276828289032, "reward_std": 0.1162225604057312, "rewards/simpleverify_reward/mean": 0.6372767686843872, "rewards/simpleverify_reward/std": 0.481054425239563, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3322.0, "completions/mean_length": 618.9765625, "completions/mean_terminated_length": 611.197998046875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 6.681049562682215, "grad_norm": 0.15564028918743134, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 472096863.0, "reward": 0.6741071939468384, "reward_std": 0.13655118644237518, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692258834839, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3628.0, "completions/mean_length": 643.9542846679688, "completions/mean_terminated_length": 597.0939331054688, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 6.690379008746356, "grad_norm": 0.15273889899253845, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 472764598.0, "reward": 0.6171875, "reward_std": 0.17107722163200378, "rewards/simpleverify_reward/mean": 0.6171875, "rewards/simpleverify_reward/std": 0.4863446056842804, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4049.0, "completions/mean_length": 649.6339721679688, "completions/mean_terminated_length": 626.4000244140625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 6.699708454810495, "grad_norm": 0.12636427581310272, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 473435678.0, "reward": 0.625, "reward_std": 0.12700361013412476, "rewards/simpleverify_reward/mean": 0.625, "rewards/simpleverify_reward/std": 0.48439329862594604, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3616.0, "completions/mean_length": 643.2154541015625, "completions/mean_terminated_length": 619.938232421875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 6.709037900874636, "grad_norm": 0.14510133862495422, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 474096319.0, "reward": 0.668526828289032, "reward_std": 0.15045757591724396, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1846.0, "completions/mean_length": 597.1808471679688, "completions/mean_terminated_length": 577.546630859375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 6.718367346938775, "grad_norm": 0.1411646157503128, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 474730273.0, "reward": 0.6707589626312256, "reward_std": 0.14515192806720734, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2672.0, "completions/mean_length": 624.9319458007812, "completions/mean_terminated_length": 613.27099609375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 6.727696793002916, "grad_norm": 0.13814730942249298, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 475379132.0, "reward": 0.6283482313156128, "reward_std": 0.13673482835292816, "rewards/simpleverify_reward/mean": 0.6283482313156128, "rewards/simpleverify_reward/std": 0.4835159182548523, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 628.5692138671875, "completions/mean_terminated_length": 609.1111450195312, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 6.737026239067055, "grad_norm": 0.14395026862621307, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 476023250.0, "reward": 0.6852678656578064, "reward_std": 0.14109477400779724, "rewards/simpleverify_reward/mean": 0.6852678656578064, "rewards/simpleverify_reward/std": 0.46466848254203796, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3621.0, "completions/mean_length": 556.247802734375, "completions/mean_terminated_length": 544.3561401367188, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 6.746355685131196, "grad_norm": 0.1553555130958557, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 476624424.0, "reward": 0.668526828289032, "reward_std": 0.13245196640491486, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2636.0, "completions/mean_length": 576.9486694335938, "completions/mean_terminated_length": 573.0167236328125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 6.755685131195335, "grad_norm": 0.1521666795015335, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 477229306.0, "reward": 0.6439732313156128, "reward_std": 0.1173504963517189, "rewards/simpleverify_reward/mean": 0.6439732313156128, "rewards/simpleverify_reward/std": 0.47909072041511536, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 581.4967041015625, "completions/mean_terminated_length": 569.6898193359375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 6.765014577259475, "grad_norm": 0.15111105144023895, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 477831359.0, "reward": 0.6953125596046448, "reward_std": 0.14913278818130493, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3089.0, "completions/mean_length": 607.1741333007812, "completions/mean_terminated_length": 595.4535522460938, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 6.774344023323615, "grad_norm": 0.1382700651884079, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 478461035.0, "reward": 0.6361607313156128, "reward_std": 0.12546519935131073, "rewards/simpleverify_reward/mean": 0.6361607313156128, "rewards/simpleverify_reward/std": 0.4813718795776367, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2898.0, "completions/mean_length": 616.140625, "completions/mean_terminated_length": 608.355712890625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 6.783673469387755, "grad_norm": 0.16449080407619476, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 479100545.0, "reward": 0.6908482313156128, "reward_std": 0.17664362490177155, "rewards/simpleverify_reward/mean": 0.6908482313156128, "rewards/simpleverify_reward/std": 0.46240198612213135, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2601.0, "completions/mean_length": 611.7846069335938, "completions/mean_terminated_length": 603.9899291992188, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 6.793002915451895, "grad_norm": 0.15928541123867035, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 479739104.0, "reward": 0.6674107313156128, "reward_std": 0.17010079324245453, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2794.0, "completions/mean_length": 616.5892944335938, "completions/mean_terminated_length": 612.70166015625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 6.802332361516035, "grad_norm": 0.16209082305431366, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 480375576.0, "reward": 0.6227678656578064, "reward_std": 0.15161804854869843, "rewards/simpleverify_reward/mean": 0.6227678656578064, "rewards/simpleverify_reward/std": 0.4849644899368286, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 631.4140625, "completions/mean_terminated_length": 600.2015991210938, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 6.811661807580175, "grad_norm": 0.14636172354221344, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 481022699.0, "reward": 0.6729910969734192, "reward_std": 0.14240746200084686, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.46938255429267883, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2991.0, "completions/mean_length": 598.3292846679688, "completions/mean_terminated_length": 586.5789794921875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 6.820991253644315, "grad_norm": 0.15765587985515594, "learning_rate": 1e-06, "loss": 0.0258, "num_tokens": 481635338.0, "reward": 0.6171875, "reward_std": 0.15751920640468597, "rewards/simpleverify_reward/mean": 0.6171875, "rewards/simpleverify_reward/std": 0.4863446056842804, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 624.5167846679688, "completions/mean_terminated_length": 620.6380004882812, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 6.830320699708455, "grad_norm": 0.14876270294189453, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 482283705.0, "reward": 0.6361607313156128, "reward_std": 0.15157455205917358, "rewards/simpleverify_reward/mean": 0.6361607313156128, "rewards/simpleverify_reward/std": 0.4813718795776367, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3227.0, "completions/mean_length": 646.943115234375, "completions/mean_terminated_length": 619.78515625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 6.839650145772595, "grad_norm": 0.13929235935211182, "learning_rate": 1e-06, "loss": 0.0215, "num_tokens": 482953950.0, "reward": 0.6428571939468384, "reward_std": 0.13655118644237518, "rewards/simpleverify_reward/mean": 0.6428571343421936, "rewards/simpleverify_reward/std": 0.4794250428676605, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3348.0, "completions/mean_length": 611.2098388671875, "completions/mean_terminated_length": 591.6543579101562, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 6.848979591836734, "grad_norm": 0.15709836781024933, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 483602682.0, "reward": 0.652901828289032, "reward_std": 0.1597762107849121, "rewards/simpleverify_reward/mean": 0.6529017686843872, "rewards/simpleverify_reward/std": 0.47631317377090454, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3655.0, "completions/mean_length": 623.7980346679688, "completions/mean_terminated_length": 604.3131713867188, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 6.858309037900875, "grad_norm": 0.1560019552707672, "learning_rate": 1e-06, "loss": 0.0305, "num_tokens": 484250317.0, "reward": 0.6484375, "reward_std": 0.16499380767345428, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 617.9710083007812, "completions/mean_terminated_length": 606.2866821289062, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 6.867638483965014, "grad_norm": 0.14572720229625702, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 484903803.0, "reward": 0.6640625, "reward_std": 0.1321898251771927, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 602.575927734375, "completions/mean_terminated_length": 579.0247192382812, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 6.876967930029155, "grad_norm": 0.1348515897989273, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 485543847.0, "reward": 0.6964285969734192, "reward_std": 0.13226468861103058, "rewards/simpleverify_reward/mean": 0.6964285969734192, "rewards/simpleverify_reward/std": 0.4600565731525421, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 581.919677734375, "completions/mean_terminated_length": 554.249755859375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 6.886297376093294, "grad_norm": 0.16022305190563202, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 486164263.0, "reward": 0.6875000596046448, "reward_std": 0.1281326860189438, "rewards/simpleverify_reward/mean": 0.6875, "rewards/simpleverify_reward/std": 0.4637712836265564, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 637.029052734375, "completions/mean_terminated_length": 625.4087524414062, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 6.895626822157435, "grad_norm": 0.1485331505537033, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 486829113.0, "reward": 0.6238839626312256, "reward_std": 0.1441006362438202, "rewards/simpleverify_reward/mean": 0.6238839030265808, "rewards/simpleverify_reward/std": 0.48468026518821716, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3189.0, "completions/mean_length": 590.4765625, "completions/mean_terminated_length": 566.8438110351562, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 6.904956268221574, "grad_norm": 0.17279596626758575, "learning_rate": 1e-06, "loss": 0.0386, "num_tokens": 487439364.0, "reward": 0.6674107313156128, "reward_std": 0.14995692670345306, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 645.9096069335938, "completions/mean_terminated_length": 610.9030151367188, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 6.914285714285715, "grad_norm": 0.12470883131027222, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 488112499.0, "reward": 0.59375, "reward_std": 0.12260834872722626, "rewards/simpleverify_reward/mean": 0.59375, "rewards/simpleverify_reward/std": 0.4914066195487976, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3967.0, "completions/mean_length": 579.7723388671875, "completions/mean_terminated_length": 564.0045166015625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 6.923615160349854, "grad_norm": 0.1684962809085846, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 488712463.0, "reward": 0.6808035969734192, "reward_std": 0.19181743264198303, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.46642565727233887, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 584.1986694335938, "completions/mean_terminated_length": 568.45068359375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 6.932944606413994, "grad_norm": 0.16816537082195282, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 489320409.0, "reward": 0.6640625, "reward_std": 0.15710732340812683, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3943.0, "completions/mean_length": 577.0904541015625, "completions/mean_terminated_length": 561.310546875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 6.942274052478134, "grad_norm": 0.13904297351837158, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 489932450.0, "reward": 0.6796875596046448, "reward_std": 0.12223179638385773, "rewards/simpleverify_reward/mean": 0.6796875, "rewards/simpleverify_reward/std": 0.4668572247028351, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 644.193115234375, "completions/mean_terminated_length": 613.0957641601562, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 6.9516034985422746, "grad_norm": 0.14963756501674652, "learning_rate": 1e-06, "loss": 0.0318, "num_tokens": 490595951.0, "reward": 0.6071428656578064, "reward_std": 0.1565760225057602, "rewards/simpleverify_reward/mean": 0.6071428656578064, "rewards/simpleverify_reward/std": 0.48865827918052673, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 615.3046875, "completions/mean_terminated_length": 595.772216796875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 6.960932944606414, "grad_norm": 0.15365329384803772, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 491237824.0, "reward": 0.652901828289032, "reward_std": 0.14120423793792725, "rewards/simpleverify_reward/mean": 0.6529017686843872, "rewards/simpleverify_reward/std": 0.47631320357322693, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 558.5457763671875, "completions/mean_terminated_length": 546.6618041992188, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 6.970262390670554, "grad_norm": 0.18000993132591248, "learning_rate": 1e-06, "loss": 0.0309, "num_tokens": 491829201.0, "reward": 0.6774553656578064, "reward_std": 0.17115361988544464, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3377.0, "completions/mean_length": 596.2902221679688, "completions/mean_terminated_length": 584.5330810546875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 6.979591836734694, "grad_norm": 0.17047137022018433, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 492449989.0, "reward": 0.6316964626312256, "reward_std": 0.19181743264198303, "rewards/simpleverify_reward/mean": 0.6316964030265808, "rewards/simpleverify_reward/std": 0.4826137125492096, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3464.0, "completions/mean_length": 604.185302734375, "completions/mean_terminated_length": 580.6449584960938, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 6.988921282798834, "grad_norm": 0.14452986419200897, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 493071715.0, "reward": 0.691964328289032, "reward_std": 0.1345549076795578, "rewards/simpleverify_reward/mean": 0.6919642686843872, "rewards/simpleverify_reward/std": 0.4619392454624176, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0028409090909090606, "completions/max_length": 4096.0, "completions/max_terminated_length": 1494.0, "completions/mean_length": 548.289794921875, "completions/mean_terminated_length": 538.1823120117188, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 6.998250728862974, "grad_norm": 0.15846364200115204, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 493675768.0, "reward": 0.6328125, "reward_std": 0.15097680687904358, "rewards/simpleverify_reward/mean": 0.6328125, "rewards/simpleverify_reward/std": 0.48230743408203125, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 579.552490234375, "completions/mean_terminated_length": 567.7390747070312, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 7.0093294460641395, "grad_norm": 0.15331104397773743, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 494290935.0, "reward": 0.6495535969734192, "reward_std": 0.15627041459083557, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.477376252412796, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 569.4408569335938, "completions/mean_terminated_length": 549.6510009765625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 7.01865889212828, "grad_norm": 0.16318942606449127, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 494892338.0, "reward": 0.6953125596046448, "reward_std": 0.14969801902770996, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 597.171875, "completions/mean_terminated_length": 565.6509399414062, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 7.0279883381924195, "grad_norm": 0.1536768674850464, "learning_rate": 1e-06, "loss": 0.0409, "num_tokens": 495511716.0, "reward": 0.676339328289032, "reward_std": 0.1544366329908371, "rewards/simpleverify_reward/mean": 0.6763392686843872, "rewards/simpleverify_reward/std": 0.4681335985660553, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3749.0, "completions/mean_length": 609.5256958007812, "completions/mean_terminated_length": 593.8912963867188, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 7.03731778425656, "grad_norm": 0.1511920839548111, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 496143507.0, "reward": 0.691964328289032, "reward_std": 0.14086836576461792, "rewards/simpleverify_reward/mean": 0.6919642686843872, "rewards/simpleverify_reward/std": 0.4619392454624176, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2642.0, "completions/mean_length": 592.068115234375, "completions/mean_terminated_length": 564.4780883789062, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 7.0466472303206995, "grad_norm": 0.14457975327968597, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 496760904.0, "reward": 0.6752232313156128, "reward_std": 0.132340669631958, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3816.0, "completions/mean_length": 612.8046875, "completions/mean_terminated_length": 601.10302734375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 7.05597667638484, "grad_norm": 0.14353680610656738, "learning_rate": 1e-06, "loss": 0.0142, "num_tokens": 497410657.0, "reward": 0.6741071939468384, "reward_std": 0.13696163892745972, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3531.0, "completions/mean_length": 602.0011596679688, "completions/mean_terminated_length": 586.3330078125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 7.0653061224489795, "grad_norm": 0.15002726018428802, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 498039122.0, "reward": 0.6629464626312256, "reward_std": 0.14574965834617615, "rewards/simpleverify_reward/mean": 0.6629464030265808, "rewards/simpleverify_reward/std": 0.47296738624572754, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 550.0670166015625, "completions/mean_terminated_length": 534.1659545898438, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 7.07463556851312, "grad_norm": 0.15255022048950195, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 498619222.0, "reward": 0.6897321939468384, "reward_std": 0.12903308868408203, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 585.4185791015625, "completions/mean_terminated_length": 553.7916870117188, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 7.0839650145772595, "grad_norm": 0.17639301717281342, "learning_rate": 1e-06, "loss": 0.0165, "num_tokens": 499231749.0, "reward": 0.6674107313156128, "reward_std": 0.20203647017478943, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2168.0, "completions/mean_length": 547.2076416015625, "completions/mean_terminated_length": 539.2684326171875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 7.093294460641399, "grad_norm": 0.15720196068286896, "learning_rate": 1e-06, "loss": 0.0269, "num_tokens": 499814247.0, "reward": 0.6640625, "reward_std": 0.13662464916706085, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3897.0, "completions/mean_length": 575.646240234375, "completions/mean_terminated_length": 567.7706909179688, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 7.1026239067055394, "grad_norm": 0.14691078662872314, "learning_rate": 1e-06, "loss": 0.0204, "num_tokens": 500421946.0, "reward": 0.6696428656578064, "reward_std": 0.13083365559577942, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3395.0, "completions/mean_length": 588.8817138671875, "completions/mean_terminated_length": 569.200927734375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 7.111953352769679, "grad_norm": 0.14059746265411377, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 501032312.0, "reward": 0.6674107313156128, "reward_std": 0.136431023478508, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2799.0, "completions/mean_length": 623.125, "completions/mean_terminated_length": 603.6364135742188, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 7.121282798833819, "grad_norm": 0.14372898638248444, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 501680976.0, "reward": 0.6395089626312256, "reward_std": 0.1310618817806244, "rewards/simpleverify_reward/mean": 0.6395089030265808, "rewards/simpleverify_reward/std": 0.4804111123085022, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3819.0, "completions/mean_length": 583.84375, "completions/mean_terminated_length": 564.1347045898438, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 7.130612244897959, "grad_norm": 0.16338077187538147, "learning_rate": 1e-06, "loss": 0.0241, "num_tokens": 502298852.0, "reward": 0.65625, "reward_std": 0.1785932034254074, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 586.9152221679688, "completions/mean_terminated_length": 579.0648803710938, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 7.139941690962099, "grad_norm": 0.149337038397789, "learning_rate": 1e-06, "loss": 0.0174, "num_tokens": 502904928.0, "reward": 0.6819196939468384, "reward_std": 0.13831782341003418, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3609.0, "completions/mean_length": 615.6785888671875, "completions/mean_terminated_length": 596.148193359375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 7.149271137026239, "grad_norm": 0.16210536658763885, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 503556048.0, "reward": 0.6428571939468384, "reward_std": 0.16645875573158264, "rewards/simpleverify_reward/mean": 0.6428571343421936, "rewards/simpleverify_reward/std": 0.4794250428676605, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3892.0, "completions/mean_length": 619.5033569335938, "completions/mean_terminated_length": 556.2943115234375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 7.158600583090379, "grad_norm": 0.15227757394313812, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 504194075.0, "reward": 0.6651785969734192, "reward_std": 0.1392507404088974, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219157218933105, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 600.4754638671875, "completions/mean_terminated_length": 565.0078735351562, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 7.167930029154519, "grad_norm": 0.16732628643512726, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 504825485.0, "reward": 0.6194196939468384, "reward_std": 0.1648850291967392, "rewards/simpleverify_reward/mean": 0.6194196343421936, "rewards/simpleverify_reward/std": 0.48580074310302734, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3258.0, "completions/mean_length": 550.6551513671875, "completions/mean_terminated_length": 542.7236938476562, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 7.1772594752186585, "grad_norm": 0.1520986407995224, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 505404952.0, "reward": 0.7053571939468384, "reward_std": 0.12317357957363129, "rewards/simpleverify_reward/mean": 0.7053571343421936, "rewards/simpleverify_reward/std": 0.45613667368888855, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 589.4732666015625, "completions/mean_terminated_length": 569.7957763671875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 7.186588921282799, "grad_norm": 0.16352282464504242, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 506021160.0, "reward": 0.6897321939468384, "reward_std": 0.165178582072258, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.462861567735672, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 573.3392944335938, "completions/mean_terminated_length": 569.4033203125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 7.1959183673469385, "grad_norm": 0.15409034490585327, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 506627392.0, "reward": 0.6808035969734192, "reward_std": 0.1287313997745514, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.4664256274700165, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 598.8225708007812, "completions/mean_terminated_length": 598.8225708007812, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 7.205247813411079, "grad_norm": 0.1618225872516632, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 507249369.0, "reward": 0.606026828289032, "reward_std": 0.14692038297653198, "rewards/simpleverify_reward/mean": 0.6060267686843872, "rewards/simpleverify_reward/std": 0.48890194296836853, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2132.0, "completions/mean_length": 592.2489013671875, "completions/mean_terminated_length": 588.3340454101562, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 7.214577259475218, "grad_norm": 0.16364146769046783, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 507873416.0, "reward": 0.6886160969734192, "reward_std": 0.14481672644615173, "rewards/simpleverify_reward/mean": 0.6886160969734192, "rewards/simpleverify_reward/std": 0.46331802010536194, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2529.0, "completions/mean_length": 560.3705444335938, "completions/mean_terminated_length": 544.5157470703125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 7.223906705539359, "grad_norm": 0.15597490966320038, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 508470388.0, "reward": 0.6428571939468384, "reward_std": 0.1406715214252472, "rewards/simpleverify_reward/mean": 0.6428571343421936, "rewards/simpleverify_reward/std": 0.4794250428676605, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3377.0, "completions/mean_length": 567.734375, "completions/mean_terminated_length": 539.9527587890625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 7.233236151603498, "grad_norm": 0.15652085840702057, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 509070894.0, "reward": 0.6584821939468384, "reward_std": 0.15646472573280334, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 660.4375, "completions/mean_terminated_length": 617.735595703125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 7.242565597667639, "grad_norm": 0.14106586575508118, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 509753934.0, "reward": 0.6696428656578064, "reward_std": 0.1465412974357605, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3827.0, "completions/mean_length": 643.328125, "completions/mean_terminated_length": 616.1417846679688, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 7.251895043731778, "grad_norm": 0.16558492183685303, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 510416340.0, "reward": 0.6629464626312256, "reward_std": 0.1670200526714325, "rewards/simpleverify_reward/mean": 0.6629464030265808, "rewards/simpleverify_reward/std": 0.47296738624572754, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4017.0, "completions/mean_length": 590.357177734375, "completions/mean_terminated_length": 554.7869262695312, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 7.261224489795918, "grad_norm": 0.146736279129982, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 511027564.0, "reward": 0.7109375596046448, "reward_std": 0.1310618817806244, "rewards/simpleverify_reward/mean": 0.7109375, "rewards/simpleverify_reward/std": 0.45358020067214966, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 598.53125, "completions/mean_terminated_length": 578.9046020507812, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 7.270553935860058, "grad_norm": 0.14659114181995392, "learning_rate": 1e-06, "loss": 0.0314, "num_tokens": 511649320.0, "reward": 0.6729910969734192, "reward_std": 0.14391520619392395, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.46938255429267883, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 619.4888916015625, "completions/mean_terminated_length": 603.8991088867188, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 7.279883381924198, "grad_norm": 0.15978503227233887, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 512296918.0, "reward": 0.6026785969734192, "reward_std": 0.1889670193195343, "rewards/simpleverify_reward/mean": 0.6026785969734192, "rewards/simpleverify_reward/std": 0.48961687088012695, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3475.0, "completions/mean_length": 602.5301513671875, "completions/mean_terminated_length": 571.0574340820312, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 7.289212827988338, "grad_norm": 0.13884754478931427, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 512925009.0, "reward": 0.6573660969734192, "reward_std": 0.11899880319833755, "rewards/simpleverify_reward/mean": 0.6573660969734192, "rewards/simpleverify_reward/std": 0.47485533356666565, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2918.0, "completions/mean_length": 577.6796875, "completions/mean_terminated_length": 549.9763793945312, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 7.298542274052478, "grad_norm": 0.14683859050273895, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 513524914.0, "reward": 0.7031250596046448, "reward_std": 0.164046972990036, "rewards/simpleverify_reward/mean": 0.703125, "rewards/simpleverify_reward/std": 0.4571361541748047, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3766.0, "completions/mean_length": 643.8471069335938, "completions/mean_terminated_length": 628.3666381835938, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 7.307871720116618, "grad_norm": 0.16597643494606018, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 514195209.0, "reward": 0.6328125, "reward_std": 0.1991853415966034, "rewards/simpleverify_reward/mean": 0.6328125, "rewards/simpleverify_reward/std": 0.48230743408203125, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3997.0, "completions/mean_length": 596.0714721679688, "completions/mean_terminated_length": 584.3135986328125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 7.317201166180758, "grad_norm": 0.14572110772132874, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 514814633.0, "reward": 0.6808035969734192, "reward_std": 0.15022794902324677, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.46642565727233887, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3839.0, "completions/mean_length": 602.71875, "completions/mean_terminated_length": 567.27392578125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 7.326530612244898, "grad_norm": 0.16443875432014465, "learning_rate": 1e-06, "loss": 0.0323, "num_tokens": 515444613.0, "reward": 0.6930803656578064, "reward_std": 0.178518608212471, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147337555885315, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2202.0, "completions/mean_length": 581.0502319335938, "completions/mean_terminated_length": 561.3255004882812, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 7.335860058309038, "grad_norm": 0.14337275922298431, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 516046874.0, "reward": 0.6819196939468384, "reward_std": 0.1330888569355011, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3242.0, "completions/mean_length": 602.0145263671875, "completions/mean_terminated_length": 582.4074096679688, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 7.345189504373177, "grad_norm": 0.1628485918045044, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 516676263.0, "reward": 0.6037946939468384, "reward_std": 0.14263640344142914, "rewards/simpleverify_reward/mean": 0.6037946343421936, "rewards/simpleverify_reward/std": 0.48938122391700745, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 618.8236694335938, "completions/mean_terminated_length": 587.4977416992188, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 7.354518950437318, "grad_norm": 0.15761904418468475, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 517310985.0, "reward": 0.6785714626312256, "reward_std": 0.14789676666259766, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4079.0, "completions/mean_length": 626.1506958007812, "completions/mean_terminated_length": 602.7584228515625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 7.363848396501457, "grad_norm": 0.14488911628723145, "learning_rate": 1e-06, "loss": 0.0236, "num_tokens": 517967584.0, "reward": 0.6808035969734192, "reward_std": 0.1438724249601364, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.4664256274700165, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3558.0, "completions/mean_length": 570.7064819335938, "completions/mean_terminated_length": 566.767578125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 7.373177842565598, "grad_norm": 0.14406876266002655, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 518564777.0, "reward": 0.7299107313156128, "reward_std": 0.12685276567935944, "rewards/simpleverify_reward/mean": 0.7299107313156128, "rewards/simpleverify_reward/std": 0.4442536532878876, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 533.5647583007812, "completions/mean_terminated_length": 517.5897216796875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 7.382507288629737, "grad_norm": 0.1957208812236786, "learning_rate": 1e-06, "loss": 0.0388, "num_tokens": 519132851.0, "reward": 0.7064732313156128, "reward_std": 0.1832933872938156, "rewards/simpleverify_reward/mean": 0.7064732313156128, "rewards/simpleverify_reward/std": 0.4556320011615753, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3891.0, "completions/mean_length": 582.2545166015625, "completions/mean_terminated_length": 550.59912109375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 7.391836734693878, "grad_norm": 0.12703897058963776, "learning_rate": 1e-06, "loss": 0.0149, "num_tokens": 519745127.0, "reward": 0.7276785969734192, "reward_std": 0.10295554995536804, "rewards/simpleverify_reward/mean": 0.7276785969734192, "rewards/simpleverify_reward/std": 0.4454030692577362, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3735.0, "completions/mean_length": 536.544677734375, "completions/mean_terminated_length": 524.5867919921875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 7.401166180758017, "grad_norm": 0.1522340476512909, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 520323543.0, "reward": 0.6506696939468384, "reward_std": 0.12873069941997528, "rewards/simpleverify_reward/mean": 0.6506696343421936, "rewards/simpleverify_reward/std": 0.47702476382255554, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3336.0, "completions/mean_length": 584.1373291015625, "completions/mean_terminated_length": 552.4989013671875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 7.410495626822158, "grad_norm": 0.1501213014125824, "learning_rate": 1e-06, "loss": 0.0249, "num_tokens": 520935978.0, "reward": 0.7053571939468384, "reward_std": 0.13932742178440094, "rewards/simpleverify_reward/mean": 0.7053571343421936, "rewards/simpleverify_reward/std": 0.45613664388656616, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1896.0, "completions/mean_length": 575.6082763671875, "completions/mean_terminated_length": 559.82177734375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 7.419825072886297, "grad_norm": 0.13972879946231842, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 521537571.0, "reward": 0.7165178656578064, "reward_std": 0.10280469805002213, "rewards/simpleverify_reward/mean": 0.7165178656578064, "rewards/simpleverify_reward/std": 0.4509401023387909, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3248.0, "completions/mean_length": 531.7545166015625, "completions/mean_terminated_length": 523.78076171875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 7.429154518950437, "grad_norm": 0.15884695947170258, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 522094295.0, "reward": 0.7332589626312256, "reward_std": 0.10878115892410278, "rewards/simpleverify_reward/mean": 0.7332589030265808, "rewards/simpleverify_reward/std": 0.4425028860569, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1858.0, "completions/mean_length": 657.4654541015625, "completions/mean_terminated_length": 638.1694946289062, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 7.438483965014577, "grad_norm": 0.16596072912216187, "learning_rate": 1e-06, "loss": 0.0204, "num_tokens": 522783000.0, "reward": 0.6015625, "reward_std": 0.18655958771705627, "rewards/simpleverify_reward/mean": 0.6015625, "rewards/simpleverify_reward/std": 0.48984986543655396, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2069.0, "completions/mean_length": 600.8471069335938, "completions/mean_terminated_length": 589.1052856445312, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 7.447813411078717, "grad_norm": 0.17243298888206482, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 523408295.0, "reward": 0.6629464626312256, "reward_std": 0.1804732382297516, "rewards/simpleverify_reward/mean": 0.6629464030265808, "rewards/simpleverify_reward/std": 0.47296738624572754, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3916.0, "completions/mean_length": 633.7444458007812, "completions/mean_terminated_length": 614.3154296875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 7.457142857142857, "grad_norm": 0.1631917506456375, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 524072298.0, "reward": 0.6941964626312256, "reward_std": 0.1684536337852478, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.4610042870044708, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2821.0, "completions/mean_length": 609.536865234375, "completions/mean_terminated_length": 593.9024658203125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 7.466472303206997, "grad_norm": 0.12731251120567322, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 524710099.0, "reward": 0.6819196939468384, "reward_std": 0.11922521144151688, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3863.0, "completions/mean_length": 620.138427734375, "completions/mean_terminated_length": 608.4613647460938, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 7.475801749271137, "grad_norm": 0.12294919788837433, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 525356975.0, "reward": 0.6930803656578064, "reward_std": 0.10885784029960632, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147334575653076, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2544.0, "completions/mean_length": 618.622802734375, "completions/mean_terminated_length": 603.0291748046875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 7.485131195335277, "grad_norm": 0.1675608605146408, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 525994061.0, "reward": 0.6395089626312256, "reward_std": 0.1698404997587204, "rewards/simpleverify_reward/mean": 0.6395089030265808, "rewards/simpleverify_reward/std": 0.4804111421108246, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2749.0, "completions/mean_length": 612.1663208007812, "completions/mean_terminated_length": 600.4625244140625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 7.494460641399417, "grad_norm": 0.15838921070098877, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 526627442.0, "reward": 0.691964328289032, "reward_std": 0.14312425255775452, "rewards/simpleverify_reward/mean": 0.6919642686843872, "rewards/simpleverify_reward/std": 0.4619392454624176, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2654.0, "completions/mean_length": 604.9810791015625, "completions/mean_terminated_length": 593.2531127929688, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 7.503790087463557, "grad_norm": 0.13716107606887817, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 527264177.0, "reward": 0.6674107313156128, "reward_std": 0.13264445960521698, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 588.4620971679688, "completions/mean_terminated_length": 568.7789306640625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 7.513119533527696, "grad_norm": 0.17448517680168152, "learning_rate": 1e-06, "loss": 0.0332, "num_tokens": 527883279.0, "reward": 0.6875000596046448, "reward_std": 0.162174791097641, "rewards/simpleverify_reward/mean": 0.6875, "rewards/simpleverify_reward/std": 0.4637712836265564, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2900.0, "completions/mean_length": 577.5670166015625, "completions/mean_terminated_length": 561.7892456054688, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 7.522448979591837, "grad_norm": 0.18019990622997284, "learning_rate": 1e-06, "loss": 0.0367, "num_tokens": 528485931.0, "reward": 0.668526828289032, "reward_std": 0.19001756608486176, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2676.0, "completions/mean_length": 631.7678833007812, "completions/mean_terminated_length": 608.4135131835938, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 7.531778425655976, "grad_norm": 0.1478065401315689, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 529131283.0, "reward": 0.6674107313156128, "reward_std": 0.13380561769008636, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3329.0, "completions/mean_length": 614.1596069335938, "completions/mean_terminated_length": 606.3702392578125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 7.541107871720117, "grad_norm": 0.15276165306568146, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 529769210.0, "reward": 0.6037946939468384, "reward_std": 0.1253136694431305, "rewards/simpleverify_reward/mean": 0.6037946343421936, "rewards/simpleverify_reward/std": 0.48938119411468506, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3291.0, "completions/max_terminated_length": 3291.0, "completions/mean_length": 608.950927734375, "completions/mean_terminated_length": 608.950927734375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 7.550437317784256, "grad_norm": 0.15439394116401672, "learning_rate": 1e-06, "loss": -0.0042, "num_tokens": 530394222.0, "reward": 0.6082589626312256, "reward_std": 0.1520305871963501, "rewards/simpleverify_reward/mean": 0.6082589030265808, "rewards/simpleverify_reward/std": 0.48841196298599243, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 619.310302734375, "completions/mean_terminated_length": 615.4256591796875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 7.559766763848397, "grad_norm": 0.16937923431396484, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 531036948.0, "reward": 0.6540178656578064, "reward_std": 0.18201415240764618, "rewards/simpleverify_reward/mean": 0.6540178656578064, "rewards/simpleverify_reward/std": 0.4759531021118164, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2133.0, "completions/mean_length": 565.247802734375, "completions/mean_terminated_length": 557.3489990234375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 7.569096209912536, "grad_norm": 0.15105187892913818, "learning_rate": 1e-06, "loss": 0.0149, "num_tokens": 531645450.0, "reward": 0.6395089626312256, "reward_std": 0.15172751247882843, "rewards/simpleverify_reward/mean": 0.6395089030265808, "rewards/simpleverify_reward/std": 0.4804111421108246, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 629.404052734375, "completions/mean_terminated_length": 617.7581176757812, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 7.578425655976677, "grad_norm": 0.1495620161294937, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 532294948.0, "reward": 0.6651785969734192, "reward_std": 0.15751849114894867, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219160199165344, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3297.0, "completions/mean_length": 609.2433471679688, "completions/mean_terminated_length": 601.4429321289062, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 7.587755102040816, "grad_norm": 0.16023291647434235, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 532933806.0, "reward": 0.6194196939468384, "reward_std": 0.14710581302642822, "rewards/simpleverify_reward/mean": 0.6194196343421936, "rewards/simpleverify_reward/std": 0.48580074310302734, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3849.0, "completions/mean_length": 617.5324096679688, "completions/mean_terminated_length": 605.8466186523438, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 7.597084548104956, "grad_norm": 0.14159663021564484, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 533574571.0, "reward": 0.7109375596046448, "reward_std": 0.13399037718772888, "rewards/simpleverify_reward/mean": 0.7109375, "rewards/simpleverify_reward/std": 0.45358020067214966, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3733.0, "completions/mean_length": 550.5011596679688, "completions/mean_terminated_length": 522.5838012695312, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 7.606413994169096, "grad_norm": 0.16212429106235504, "learning_rate": 1e-06, "loss": 0.017, "num_tokens": 534156996.0, "reward": 0.7678571939468384, "reward_std": 0.14203977584838867, "rewards/simpleverify_reward/mean": 0.7678571343421936, "rewards/simpleverify_reward/std": 0.422435462474823, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2746.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 591.193115234375, "completions/mean_terminated_length": 591.193115234375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 7.615743440233236, "grad_norm": 0.14301511645317078, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 534772241.0, "reward": 0.65625, "reward_std": 0.11415030062198639, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2653.0, "completions/mean_length": 614.739990234375, "completions/mean_terminated_length": 595.2042846679688, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 7.625072886297376, "grad_norm": 0.15634407103061676, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 535409200.0, "reward": 0.6183035969734192, "reward_std": 0.1676262766122818, "rewards/simpleverify_reward/mean": 0.6183035969734192, "rewards/simpleverify_reward/std": 0.4860740303993225, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2946.0, "completions/mean_length": 592.841552734375, "completions/mean_terminated_length": 573.1829833984375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 7.634402332361516, "grad_norm": 0.13555946946144104, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 536026018.0, "reward": 0.6495535969734192, "reward_std": 0.12494204193353653, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.477376252412796, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3668.0, "completions/mean_length": 647.890625, "completions/mean_terminated_length": 605.0327758789062, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 7.643731778425656, "grad_norm": 0.25269901752471924, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 536697944.0, "reward": 0.6339285969734192, "reward_std": 0.1496206372976303, "rewards/simpleverify_reward/mean": 0.6339285969734192, "rewards/simpleverify_reward/std": 0.48199835419654846, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3037.0, "completions/mean_length": 620.6785888671875, "completions/mean_terminated_length": 609.0033569335938, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 7.653061224489796, "grad_norm": 0.1709447205066681, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 537351640.0, "reward": 0.6729910969734192, "reward_std": 0.1897144764661789, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.46938255429267883, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3865.0, "completions/mean_length": 564.7210083007812, "completions/mean_terminated_length": 544.9046020507812, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 7.662390670553936, "grad_norm": 0.14506109058856964, "learning_rate": 1e-06, "loss": 0.009, "num_tokens": 537940198.0, "reward": 0.715401828289032, "reward_std": 0.1164068877696991, "rewards/simpleverify_reward/mean": 0.7154017686843872, "rewards/simpleverify_reward/std": 0.4514748752117157, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3170.0, "completions/mean_length": 626.9989013671875, "completions/mean_terminated_length": 607.5320434570312, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 7.671720116618076, "grad_norm": 0.1770143210887909, "learning_rate": 1e-06, "loss": 0.0377, "num_tokens": 538581077.0, "reward": 0.6696428656578064, "reward_std": 0.19287119805812836, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2509.0, "completions/mean_length": 594.2254638671875, "completions/mean_terminated_length": 590.3128051757812, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 7.681049562682215, "grad_norm": 0.15992018580436707, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 539209799.0, "reward": 0.6205357313156128, "reward_std": 0.16728109121322632, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2885.0, "completions/mean_length": 592.6194458007812, "completions/mean_terminated_length": 565.0337524414062, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 7.690379008746356, "grad_norm": 0.17074398696422577, "learning_rate": 1e-06, "loss": 0.0346, "num_tokens": 539826898.0, "reward": 0.6796875596046448, "reward_std": 0.15924417972564697, "rewards/simpleverify_reward/mean": 0.6796875, "rewards/simpleverify_reward/std": 0.4668572247028351, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 589.1295166015625, "completions/mean_terminated_length": 577.3482666015625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 7.699708454810495, "grad_norm": 0.15542562305927277, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 540441702.0, "reward": 0.6205357313156128, "reward_std": 0.12346893548965454, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 616.7288208007812, "completions/mean_terminated_length": 593.2730102539062, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 7.709037900874636, "grad_norm": 0.14675556123256683, "learning_rate": 1e-06, "loss": 0.0258, "num_tokens": 541089339.0, "reward": 0.6495535969734192, "reward_std": 0.13827505707740784, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.477376252412796, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4045.0, "completions/mean_length": 608.8002319335938, "completions/mean_terminated_length": 581.3419799804688, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 7.718367346938775, "grad_norm": 0.1464860439300537, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 541728936.0, "reward": 0.6383928656578064, "reward_std": 0.13929562270641327, "rewards/simpleverify_reward/mean": 0.6383928656578064, "rewards/simpleverify_reward/std": 0.4807341694831848, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3866.0, "completions/mean_length": 587.661865234375, "completions/mean_terminated_length": 567.9741821289062, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 7.727696793002916, "grad_norm": 0.16297224164009094, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 542351081.0, "reward": 0.676339328289032, "reward_std": 0.16825930774211884, "rewards/simpleverify_reward/mean": 0.6763392686843872, "rewards/simpleverify_reward/std": 0.4681335985660553, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2446.0, "completions/mean_length": 606.4029541015625, "completions/mean_terminated_length": 590.7545166015625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 7.737026239067055, "grad_norm": 0.15235655009746552, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 542987162.0, "reward": 0.6205357313156128, "reward_std": 0.13290520012378693, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 586.419677734375, "completions/mean_terminated_length": 566.7250366210938, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 7.746355685131196, "grad_norm": 0.15495814383029938, "learning_rate": 1e-06, "loss": 0.0318, "num_tokens": 543602738.0, "reward": 0.7064732313156128, "reward_std": 0.13647380471229553, "rewards/simpleverify_reward/mean": 0.7064732313156128, "rewards/simpleverify_reward/std": 0.4556320011615753, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2200.0, "completions/mean_length": 604.0703125, "completions/mean_terminated_length": 580.5292358398438, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 7.755685131195335, "grad_norm": 0.17322348058223724, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 544236801.0, "reward": 0.6640625, "reward_std": 0.18085090816020966, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2446.0, "completions/mean_length": 633.5670166015625, "completions/mean_terminated_length": 606.3037109375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 7.765014577259475, "grad_norm": 0.11913415789604187, "learning_rate": 1e-06, "loss": 0.0149, "num_tokens": 544891213.0, "reward": 0.6707589626312256, "reward_std": 0.10025416314601898, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3305.0, "completions/mean_length": 658.9308471679688, "completions/mean_terminated_length": 639.6431274414062, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 7.774344023323615, "grad_norm": 0.13648222386837006, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 545563159.0, "reward": 0.5837053656578064, "reward_std": 0.12643907964229584, "rewards/simpleverify_reward/mean": 0.5837053656578064, "rewards/simpleverify_reward/std": 0.49321892857551575, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2458.0, "completions/mean_length": 560.0703125, "completions/mean_terminated_length": 556.1195068359375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 7.783673469387755, "grad_norm": 0.15224404633045197, "learning_rate": 1e-06, "loss": 0.0098, "num_tokens": 546153174.0, "reward": 0.6584821939468384, "reward_std": 0.13583509624004364, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2036.0, "completions/max_terminated_length": 2036.0, "completions/mean_length": 542.6819458007812, "completions/mean_terminated_length": 542.6819458007812, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 7.793002915451895, "grad_norm": 0.14624527096748352, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 546736513.0, "reward": 0.7265625596046448, "reward_std": 0.13760314881801605, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3126.0, "completions/mean_length": 633.4542846679688, "completions/mean_terminated_length": 617.9271850585938, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 7.802332361516035, "grad_norm": 0.14659970998764038, "learning_rate": 1e-06, "loss": 0.0192, "num_tokens": 547396728.0, "reward": 0.6383928656578064, "reward_std": 0.12283840030431747, "rewards/simpleverify_reward/mean": 0.6383928656578064, "rewards/simpleverify_reward/std": 0.4807341992855072, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3734.0, "completions/mean_length": 570.9710083007812, "completions/mean_terminated_length": 563.0850219726562, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 7.811661807580175, "grad_norm": 0.16410227119922638, "learning_rate": 1e-06, "loss": 0.0159, "num_tokens": 548008990.0, "reward": 0.6629464626312256, "reward_std": 0.13993361592292786, "rewards/simpleverify_reward/mean": 0.6629464030265808, "rewards/simpleverify_reward/std": 0.47296738624572754, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3422.0, "completions/mean_length": 621.4576416015625, "completions/mean_terminated_length": 613.6845703125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 7.820991253644315, "grad_norm": 0.13715656101703644, "learning_rate": 1e-06, "loss": 0.0264, "num_tokens": 548651720.0, "reward": 0.6964285969734192, "reward_std": 0.12580150365829468, "rewards/simpleverify_reward/mean": 0.6964285969734192, "rewards/simpleverify_reward/std": 0.4600565731525421, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3953.0, "completions/mean_length": 598.5658569335938, "completions/mean_terminated_length": 574.9876708984375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 7.830320699708455, "grad_norm": 0.14342063665390015, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 549275179.0, "reward": 0.6741071939468384, "reward_std": 0.11163367331027985, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 540.4129638671875, "completions/mean_terminated_length": 536.440185546875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 7.839650145772595, "grad_norm": 0.16575446724891663, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 549854669.0, "reward": 0.6741071939468384, "reward_std": 0.156651571393013, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 602.333740234375, "completions/mean_terminated_length": 586.6670532226562, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 7.848979591836734, "grad_norm": 0.1419413685798645, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 550479240.0, "reward": 0.6729910969734192, "reward_std": 0.12335974723100662, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.46938255429267883, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2942.0, "completions/mean_length": 530.6473388671875, "completions/mean_terminated_length": 518.669677734375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 7.858309037900875, "grad_norm": 0.17542192339897156, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 551052100.0, "reward": 0.7209821939468384, "reward_std": 0.16645875573158264, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3964.0, "completions/mean_length": 617.6473388671875, "completions/mean_terminated_length": 613.7608642578125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 7.867638483965014, "grad_norm": 0.15278922021389008, "learning_rate": 1e-06, "loss": 0.0098, "num_tokens": 551690080.0, "reward": 0.6584821939468384, "reward_std": 0.1465412974357605, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2209.0, "completions/mean_length": 545.8471069335938, "completions/mean_terminated_length": 537.9049072265625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 7.876967930029155, "grad_norm": 0.15382611751556396, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 552268327.0, "reward": 0.6808035969734192, "reward_std": 0.12392286956310272, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.46642565727233887, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2174.0, "completions/mean_length": 531.0457763671875, "completions/mean_terminated_length": 523.0704956054688, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 7.886297376093294, "grad_norm": 0.16686837375164032, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 552827296.0, "reward": 0.7254464626312256, "reward_std": 0.13703832030296326, "rewards/simpleverify_reward/mean": 0.7254464030265808, "rewards/simpleverify_reward/std": 0.4465382993221283, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2056.0, "completions/mean_length": 586.1027221679688, "completions/mean_terminated_length": 566.4063110351562, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 7.895626822157435, "grad_norm": 0.15240783989429474, "learning_rate": 1e-06, "loss": 0.0205, "num_tokens": 553449796.0, "reward": 0.6707589626312256, "reward_std": 0.1407923698425293, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2127.0, "completions/mean_length": 586.6049194335938, "completions/mean_terminated_length": 562.946044921875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 7.904956268221574, "grad_norm": 0.157833531498909, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 554072058.0, "reward": 0.6852678656578064, "reward_std": 0.14477255940437317, "rewards/simpleverify_reward/mean": 0.6852678656578064, "rewards/simpleverify_reward/std": 0.46466848254203796, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 594.5346069335938, "completions/mean_terminated_length": 582.7716064453125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 7.914285714285715, "grad_norm": 0.1535460650920868, "learning_rate": 1e-06, "loss": 0.0192, "num_tokens": 554705593.0, "reward": 0.6930803656578064, "reward_std": 0.1679643839597702, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147337555885315, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 610.3828125, "completions/mean_terminated_length": 567.0587768554688, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 7.923615160349854, "grad_norm": 0.14707285165786743, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 555344024.0, "reward": 0.6473214626312256, "reward_std": 0.12587565183639526, "rewards/simpleverify_reward/mean": 0.6473214030265808, "rewards/simpleverify_reward/std": 0.47807058691978455, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2443.0, "completions/mean_length": 578.7645263671875, "completions/mean_terminated_length": 574.8345947265625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 7.932944606413994, "grad_norm": 0.1496608853340149, "learning_rate": 1e-06, "loss": 0.0045, "num_tokens": 555953829.0, "reward": 0.6674107313156128, "reward_std": 0.15315786004066467, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3210.0, "completions/mean_length": 627.3995971679688, "completions/mean_terminated_length": 607.9349365234375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 7.942274052478134, "grad_norm": 0.17803548276424408, "learning_rate": 1e-06, "loss": 0.0349, "num_tokens": 556610755.0, "reward": 0.6495535969734192, "reward_std": 0.18814215064048767, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.477376252412796, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3389.0, "completions/mean_length": 569.6585083007812, "completions/mean_terminated_length": 553.8453369140625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 7.9516034985422746, "grad_norm": 0.16748349368572235, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 557207321.0, "reward": 0.6495535969734192, "reward_std": 0.1640915721654892, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.477376252412796, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2558.0, "completions/mean_length": 654.6551513671875, "completions/mean_terminated_length": 627.5579833984375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 7.960932944606414, "grad_norm": 0.16408587992191315, "learning_rate": 1e-06, "loss": 0.0288, "num_tokens": 557882708.0, "reward": 0.629464328289032, "reward_std": 0.1688220202922821, "rewards/simpleverify_reward/mean": 0.6294642686843872, "rewards/simpleverify_reward/std": 0.4832179844379425, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 578.3248291015625, "completions/mean_terminated_length": 578.3248291015625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 7.970262390670554, "grad_norm": 0.1689259260892868, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 558481759.0, "reward": 0.7332589626312256, "reward_std": 0.159840390086174, "rewards/simpleverify_reward/mean": 0.7332589030265808, "rewards/simpleverify_reward/std": 0.4425028860569, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3899.0, "completions/mean_length": 610.9308471679688, "completions/mean_terminated_length": 559.6217041015625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 7.979591836734694, "grad_norm": 0.1537453532218933, "learning_rate": 1e-06, "loss": 0.031, "num_tokens": 559114601.0, "reward": 0.7131696939468384, "reward_std": 0.13699443638324738, "rewards/simpleverify_reward/mean": 0.7131696343421936, "rewards/simpleverify_reward/std": 0.4525342881679535, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 637.927490234375, "completions/mean_terminated_length": 587.015869140625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 7.988921282798834, "grad_norm": 0.1421864628791809, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 559787296.0, "reward": 0.6272321939468384, "reward_std": 0.14676883816719055, "rewards/simpleverify_reward/mean": 0.6272321343421936, "rewards/simpleverify_reward/std": 0.4838111698627472, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0028409090909090606, "completions/max_length": 4096.0, "completions/max_terminated_length": 3793.0, "completions/mean_length": 680.9034423828125, "completions/mean_terminated_length": 671.1737670898438, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 7.998250728862974, "grad_norm": 0.15753310918807983, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 560395942.0, "reward": 0.6506696939468384, "reward_std": 0.13189560174942017, "rewards/simpleverify_reward/mean": 0.6506696343421936, "rewards/simpleverify_reward/std": 0.47702476382255554, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2960.0, "completions/mean_length": 613.59375, "completions/mean_terminated_length": 597.9776000976562, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 8.00932944606414, "grad_norm": 0.14500373601913452, "learning_rate": 1e-06, "loss": 0.0181, "num_tokens": 561040138.0, "reward": 0.6506696939468384, "reward_std": 0.14394839107990265, "rewards/simpleverify_reward/mean": 0.6506696343421936, "rewards/simpleverify_reward/std": 0.47702476382255554, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 576.4464721679688, "completions/mean_terminated_length": 568.5726928710938, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 8.018658892128279, "grad_norm": 0.1595386564731598, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 561652490.0, "reward": 0.6930803656578064, "reward_std": 0.14699846506118774, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147337555885315, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3220.0, "completions/mean_length": 626.7957763671875, "completions/mean_terminated_length": 611.2388305664062, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 8.02798833819242, "grad_norm": 0.14593984186649323, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 562298691.0, "reward": 0.6573660969734192, "reward_std": 0.11708837747573853, "rewards/simpleverify_reward/mean": 0.6573660969734192, "rewards/simpleverify_reward/std": 0.47485533356666565, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2730.0, "completions/mean_length": 608.8303833007812, "completions/mean_terminated_length": 604.93408203125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 8.03731778425656, "grad_norm": 0.14318211376667023, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 562924771.0, "reward": 0.668526828289032, "reward_std": 0.1348252147436142, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3219.0, "completions/mean_length": 568.1395263671875, "completions/mean_terminated_length": 556.287841796875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 8.0466472303207, "grad_norm": 0.159093976020813, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 563518872.0, "reward": 0.6863839626312256, "reward_std": 0.13789597153663635, "rewards/simpleverify_reward/mean": 0.6863839030265808, "rewards/simpleverify_reward/std": 0.46422141790390015, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 576.2299194335938, "completions/mean_terminated_length": 564.4053955078125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 8.055976676384839, "grad_norm": 0.15144293010234833, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 564116606.0, "reward": 0.6964285969734192, "reward_std": 0.14166000485420227, "rewards/simpleverify_reward/mean": 0.6964285969734192, "rewards/simpleverify_reward/std": 0.4600565731525421, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3810.0, "completions/mean_length": 574.5904541015625, "completions/mean_terminated_length": 566.7125244140625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 8.06530612244898, "grad_norm": 0.1436198353767395, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 564735775.0, "reward": 0.6741071939468384, "reward_std": 0.1322953701019287, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692258834839, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 605.1428833007812, "completions/mean_terminated_length": 577.6558227539062, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 8.07463556851312, "grad_norm": 0.16125892102718353, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 565363871.0, "reward": 0.6930803656578064, "reward_std": 0.1350441575050354, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147337555885315, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1937.0, "completions/mean_length": 554.7332763671875, "completions/mean_terminated_length": 546.8109741210938, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 8.08396501457726, "grad_norm": 0.18224909901618958, "learning_rate": 1e-06, "loss": 0.017, "num_tokens": 565958336.0, "reward": 0.6573660969734192, "reward_std": 0.15357011556625366, "rewards/simpleverify_reward/mean": 0.6573660969734192, "rewards/simpleverify_reward/std": 0.47485533356666565, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 540.6060791015625, "completions/mean_terminated_length": 532.652099609375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 8.093294460641399, "grad_norm": 0.15814852714538574, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 566528359.0, "reward": 0.6808035969734192, "reward_std": 0.12197147309780121, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.4664256274700165, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 621.4174194335938, "completions/mean_terminated_length": 605.8363647460938, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 8.102623906705539, "grad_norm": 0.15355809032917023, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 567165853.0, "reward": 0.6383928656578064, "reward_std": 0.13508442044258118, "rewards/simpleverify_reward/mean": 0.6383928656578064, "rewards/simpleverify_reward/std": 0.4807341992855072, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 611.0881958007812, "completions/mean_terminated_length": 575.728271484375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 8.11195335276968, "grad_norm": 0.16700363159179688, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 567811188.0, "reward": 0.7120535969734192, "reward_std": 0.14951187372207642, "rewards/simpleverify_reward/mean": 0.7120535969734192, "rewards/simpleverify_reward/std": 0.4530589282512665, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2557.0, "completions/mean_length": 578.7489013671875, "completions/mean_terminated_length": 562.9765014648438, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 8.12128279883382, "grad_norm": 0.1624540388584137, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 568411907.0, "reward": 0.6964285969734192, "reward_std": 0.14301547408103943, "rewards/simpleverify_reward/mean": 0.6964285969734192, "rewards/simpleverify_reward/std": 0.4600566029548645, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3490.0, "completions/mean_length": 565.872802734375, "completions/mean_terminated_length": 561.928466796875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 8.130612244897959, "grad_norm": 0.14048127830028534, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 569004313.0, "reward": 0.6484375, "reward_std": 0.11054597795009613, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3524.0, "completions/mean_length": 598.2522583007812, "completions/mean_terminated_length": 578.6240234375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 8.139941690962099, "grad_norm": 0.14932768046855927, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 569635659.0, "reward": 0.6863839626312256, "reward_std": 0.11468162387609482, "rewards/simpleverify_reward/mean": 0.6863839030265808, "rewards/simpleverify_reward/std": 0.46422144770622253, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2693.0, "completions/mean_length": 576.1886596679688, "completions/mean_terminated_length": 544.4786376953125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 8.14927113702624, "grad_norm": 0.14871780574321747, "learning_rate": 1e-06, "loss": 0.0207, "num_tokens": 570248324.0, "reward": 0.6897321939468384, "reward_std": 0.13091033697128296, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 570.2288208007812, "completions/mean_terminated_length": 550.443359375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 8.15860058309038, "grad_norm": 0.18397927284240723, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 570851961.0, "reward": 0.7198660969734192, "reward_std": 0.16555652022361755, "rewards/simpleverify_reward/mean": 0.7198660969734192, "rewards/simpleverify_reward/std": 0.44931527972221375, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3565.0, "completions/mean_length": 618.575927734375, "completions/mean_terminated_length": 606.8936157226562, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 8.167930029154519, "grad_norm": 0.1665138155221939, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 571498765.0, "reward": 0.6517857313156128, "reward_std": 0.15454654395580292, "rewards/simpleverify_reward/mean": 0.6517857313156128, "rewards/simpleverify_reward/std": 0.47667041420936584, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3831.0, "completions/mean_length": 579.6417846679688, "completions/mean_terminated_length": 555.9359741210938, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 8.177259475218658, "grad_norm": 0.17420698702335358, "learning_rate": 1e-06, "loss": 0.0263, "num_tokens": 572098620.0, "reward": 0.7042410969734192, "reward_std": 0.1504140943288803, "rewards/simpleverify_reward/mean": 0.7042410969734192, "rewards/simpleverify_reward/std": 0.45663803815841675, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 4042.0, "completions/mean_length": 513.2890625, "completions/mean_terminated_length": 505.2740478515625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 8.186588921282798, "grad_norm": 0.1515740007162094, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 572651055.0, "reward": 0.715401828289032, "reward_std": 0.1155831515789032, "rewards/simpleverify_reward/mean": 0.7154017686843872, "rewards/simpleverify_reward/std": 0.4514748752117157, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2529.0, "completions/mean_length": 561.5279541015625, "completions/mean_terminated_length": 557.5787353515625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 8.19591836734694, "grad_norm": 0.18121260404586792, "learning_rate": 1e-06, "loss": 0.0034, "num_tokens": 573253904.0, "reward": 0.7042410969734192, "reward_std": 0.14376293122768402, "rewards/simpleverify_reward/mean": 0.7042410969734192, "rewards/simpleverify_reward/std": 0.45663803815841675, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 595.3248291015625, "completions/mean_terminated_length": 591.4133911132812, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 8.205247813411079, "grad_norm": 0.13830621540546417, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 573880635.0, "reward": 0.6618303656578064, "reward_std": 0.13016174733638763, "rewards/simpleverify_reward/mean": 0.6618303656578064, "rewards/simpleverify_reward/std": 0.4733508229255676, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3576.0, "completions/mean_length": 578.2890625, "completions/mean_terminated_length": 562.5145874023438, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 8.214577259475218, "grad_norm": 0.13098640739917755, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 574491158.0, "reward": 0.6852678656578064, "reward_std": 0.09792228788137436, "rewards/simpleverify_reward/mean": 0.6852678656578064, "rewards/simpleverify_reward/std": 0.4646684527397156, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2869.0, "completions/mean_length": 619.3984375, "completions/mean_terminated_length": 603.808349609375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 8.223906705539358, "grad_norm": 0.1642908751964569, "learning_rate": 1e-06, "loss": 0.0189, "num_tokens": 575135291.0, "reward": 0.6205357313156128, "reward_std": 0.17585043609142303, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2614.0, "completions/max_terminated_length": 2614.0, "completions/mean_length": 552.9319458007812, "completions/mean_terminated_length": 552.9319458007812, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 8.2332361516035, "grad_norm": 0.17215660214424133, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 575726142.0, "reward": 0.6551339626312256, "reward_std": 0.17430990934371948, "rewards/simpleverify_reward/mean": 0.6551339030265808, "rewards/simpleverify_reward/std": 0.4755900800228119, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3234.0, "completions/mean_length": 592.6082763671875, "completions/mean_terminated_length": 580.8387451171875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 8.242565597667639, "grad_norm": 0.15121428668498993, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 576360631.0, "reward": 0.6395089626312256, "reward_std": 0.13978347182273865, "rewards/simpleverify_reward/mean": 0.6395089030265808, "rewards/simpleverify_reward/std": 0.4804111123085022, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 563.427490234375, "completions/mean_terminated_length": 559.4804077148438, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 8.251895043731778, "grad_norm": 0.15599484741687775, "learning_rate": 1e-06, "loss": 0.0165, "num_tokens": 576955174.0, "reward": 0.6417410969734192, "reward_std": 0.12554007768630981, "rewards/simpleverify_reward/mean": 0.6417410969734192, "rewards/simpleverify_reward/std": 0.47975656390190125, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 576.2667846679688, "completions/mean_terminated_length": 564.4423217773438, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 8.261224489795918, "grad_norm": 0.13367347419261932, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 577554309.0, "reward": 0.6696428656578064, "reward_std": 0.10637258738279343, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3279.0, "completions/mean_length": 580.9699096679688, "completions/mean_terminated_length": 565.2073974609375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 8.270553935860057, "grad_norm": 0.16119250655174255, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 578172298.0, "reward": 0.6785714626312256, "reward_std": 0.13373145461082458, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3993.0, "completions/mean_length": 608.3170166015625, "completions/mean_terminated_length": 592.6771850585938, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 8.279883381924199, "grad_norm": 0.1668153554201126, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 578808134.0, "reward": 0.6674107313156128, "reward_std": 0.13978388905525208, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 563.2745971679688, "completions/mean_terminated_length": 563.2745971679688, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 8.289212827988338, "grad_norm": 0.1796666979789734, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 579397364.0, "reward": 0.6975446939468384, "reward_std": 0.17543534934520721, "rewards/simpleverify_reward/mean": 0.6975446343421936, "rewards/simpleverify_reward/std": 0.45957791805267334, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 587.4631958007812, "completions/mean_terminated_length": 563.8101196289062, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 8.298542274052478, "grad_norm": 0.16573219001293182, "learning_rate": 1e-06, "loss": 0.0249, "num_tokens": 580013731.0, "reward": 0.6651785969734192, "reward_std": 0.1388823539018631, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219157218933105, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 573.607177734375, "completions/mean_terminated_length": 553.8406372070312, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 8.307871720116617, "grad_norm": 0.1615709364414215, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 580614171.0, "reward": 0.7276785969734192, "reward_std": 0.14094531536102295, "rewards/simpleverify_reward/mean": 0.7276785969734192, "rewards/simpleverify_reward/std": 0.4454030692577362, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 606.732177734375, "completions/mean_terminated_length": 579.2576293945312, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 8.317201166180759, "grad_norm": 0.15659677982330322, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 581251395.0, "reward": 0.6986607313156128, "reward_std": 0.1498495638370514, "rewards/simpleverify_reward/mean": 0.6986607313156128, "rewards/simpleverify_reward/std": 0.4590960443019867, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 567.859375, "completions/mean_terminated_length": 563.9172973632812, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 8.326530612244898, "grad_norm": 0.14878599345684052, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 581856141.0, "reward": 0.6741071939468384, "reward_std": 0.14188753068447113, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2265.0, "completions/mean_length": 561.7522583007812, "completions/mean_terminated_length": 529.9121704101562, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 8.335860058309038, "grad_norm": 0.14907006919384003, "learning_rate": 1e-06, "loss": 0.0068, "num_tokens": 582447455.0, "reward": 0.65625, "reward_std": 0.13647449016571045, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 579.075927734375, "completions/mean_terminated_length": 571.2080688476562, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 8.345189504373177, "grad_norm": 0.16779015958309174, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 583047443.0, "reward": 0.6573660969734192, "reward_std": 0.16480834782123566, "rewards/simpleverify_reward/mean": 0.6573660969734192, "rewards/simpleverify_reward/std": 0.47485533356666565, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2878.0, "completions/mean_length": 575.5592041015625, "completions/mean_terminated_length": 551.8258666992188, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 8.354518950437317, "grad_norm": 0.1402091085910797, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 583656640.0, "reward": 0.6808035969734192, "reward_std": 0.09525452554225922, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.4664256274700165, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4013.0, "completions/mean_length": 637.4453125, "completions/mean_terminated_length": 618.0370483398438, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 8.363848396501458, "grad_norm": 0.16806653141975403, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 584311263.0, "reward": 0.645089328289032, "reward_std": 0.1503046303987503, "rewards/simpleverify_reward/mean": 0.6450892686843872, "rewards/simpleverify_reward/std": 0.4787535071372986, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 578.8214721679688, "completions/mean_terminated_length": 574.8916015625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 8.373177842565598, "grad_norm": 0.1536044478416443, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 584913271.0, "reward": 0.7120535969734192, "reward_std": 0.1328631341457367, "rewards/simpleverify_reward/mean": 0.7120535969734192, "rewards/simpleverify_reward/std": 0.4530588984489441, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2056.0, "completions/mean_length": 535.3147583007812, "completions/mean_terminated_length": 523.352783203125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 8.382507288629737, "grad_norm": 0.16782277822494507, "learning_rate": 1e-06, "loss": 0.0384, "num_tokens": 585493633.0, "reward": 0.7042410969734192, "reward_std": 0.15394991636276245, "rewards/simpleverify_reward/mean": 0.7042410969734192, "rewards/simpleverify_reward/std": 0.45663806796073914, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3448.0, "completions/mean_length": 575.0, "completions/mean_terminated_length": 567.123046875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 8.391836734693877, "grad_norm": 0.17557744681835175, "learning_rate": 1e-06, "loss": 0.0236, "num_tokens": 586085849.0, "reward": 0.746651828289032, "reward_std": 0.1441759467124939, "rewards/simpleverify_reward/mean": 0.7466517686843872, "rewards/simpleverify_reward/std": 0.435171514749527, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 1851.0, "completions/mean_length": 523.7667846679688, "completions/mean_terminated_length": 515.775146484375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 8.401166180758018, "grad_norm": 0.15878252685070038, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 586647248.0, "reward": 0.6941964626312256, "reward_std": 0.11355438083410263, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.4610042870044708, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3647.0, "completions/mean_length": 562.8917846679688, "completions/mean_terminated_length": 547.0482177734375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 8.410495626822158, "grad_norm": 0.18212451040744781, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 587231999.0, "reward": 0.754464328289032, "reward_std": 0.15537138283252716, "rewards/simpleverify_reward/mean": 0.7544642686843872, "rewards/simpleverify_reward/std": 0.43064478039741516, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 582.65625, "completions/mean_terminated_length": 578.730712890625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 8.419825072886297, "grad_norm": 0.16127997636795044, "learning_rate": 1e-06, "loss": 0.0228, "num_tokens": 587837179.0, "reward": 0.7142857313156128, "reward_std": 0.15165898203849792, "rewards/simpleverify_reward/mean": 0.7142857313156128, "rewards/simpleverify_reward/std": 0.4520062506198883, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3950.0, "completions/mean_length": 569.0658569335938, "completions/mean_terminated_length": 557.21728515625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 8.429154518950437, "grad_norm": 0.14924167096614838, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 588428190.0, "reward": 0.7745535969734192, "reward_std": 0.12332694232463837, "rewards/simpleverify_reward/mean": 0.7745535969734192, "rewards/simpleverify_reward/std": 0.41810935735702515, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 562.9386596679688, "completions/mean_terminated_length": 555.03466796875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 8.438483965014576, "grad_norm": 0.15051108598709106, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 589020015.0, "reward": 0.7131696939468384, "reward_std": 0.1191510558128357, "rewards/simpleverify_reward/mean": 0.7131696343421936, "rewards/simpleverify_reward/std": 0.4525342881679535, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2986.0, "completions/mean_length": 616.8995971679688, "completions/mean_terminated_length": 605.211669921875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 8.447813411078718, "grad_norm": 0.12895597517490387, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 589669557.0, "reward": 0.676339328289032, "reward_std": 0.10644815862178802, "rewards/simpleverify_reward/mean": 0.6763392686843872, "rewards/simpleverify_reward/std": 0.4681335985660553, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2883.0, "completions/mean_length": 590.0546875, "completions/mean_terminated_length": 570.3804931640625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 8.457142857142857, "grad_norm": 0.16909345984458923, "learning_rate": 1e-06, "loss": 0.0232, "num_tokens": 590294110.0, "reward": 0.715401828289032, "reward_std": 0.15957936644554138, "rewards/simpleverify_reward/mean": 0.7154017686843872, "rewards/simpleverify_reward/std": 0.4514748752117157, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3728.0, "completions/mean_length": 615.4230346679688, "completions/mean_terminated_length": 584.0664672851562, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 8.466472303206997, "grad_norm": 0.1421533077955246, "learning_rate": 1e-06, "loss": 0.0216, "num_tokens": 590931449.0, "reward": 0.7220982313156128, "reward_std": 0.1317012757062912, "rewards/simpleverify_reward/mean": 0.7220982313156128, "rewards/simpleverify_reward/std": 0.44821491837501526, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 645.443115234375, "completions/mean_terminated_length": 614.3569946289062, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 8.475801749271136, "grad_norm": 0.14956405758857727, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 591595214.0, "reward": 0.6785714626312256, "reward_std": 0.13087712228298187, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 655.6194458007812, "completions/mean_terminated_length": 640.1917114257812, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 8.485131195335278, "grad_norm": 0.13595646619796753, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 592265881.0, "reward": 0.6595982313156128, "reward_std": 0.12816406786441803, "rewards/simpleverify_reward/mean": 0.6595982313156128, "rewards/simpleverify_reward/std": 0.4741089344024658, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3784.0, "completions/mean_length": 589.4397583007812, "completions/mean_terminated_length": 565.7999877929688, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 8.494460641399417, "grad_norm": 0.17461515963077545, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 592880075.0, "reward": 0.6875000596046448, "reward_std": 0.16420286893844604, "rewards/simpleverify_reward/mean": 0.6875, "rewards/simpleverify_reward/std": 0.4637712836265564, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3195.0, "completions/mean_length": 609.3147583007812, "completions/mean_terminated_length": 601.5145263671875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 8.503790087463557, "grad_norm": 0.15530502796173096, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 593526085.0, "reward": 0.6741071939468384, "reward_std": 0.12685278058052063, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692258834839, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 513.3549194335938, "completions/mean_terminated_length": 505.3400573730469, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 8.513119533527696, "grad_norm": 0.17512640357017517, "learning_rate": 1e-06, "loss": 0.0201, "num_tokens": 594074907.0, "reward": 0.7254464626312256, "reward_std": 0.137115016579628, "rewards/simpleverify_reward/mean": 0.7254464030265808, "rewards/simpleverify_reward/std": 0.4465383291244507, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 653.357177734375, "completions/mean_terminated_length": 626.249755859375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 8.522448979591836, "grad_norm": 0.14521123468875885, "learning_rate": 1e-06, "loss": 0.0256, "num_tokens": 594754515.0, "reward": 0.578125, "reward_std": 0.13012643158435822, "rewards/simpleverify_reward/mean": 0.578125, "rewards/simpleverify_reward/std": 0.4941346049308777, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3863.0, "completions/mean_length": 672.3158569335938, "completions/mean_terminated_length": 617.9716796875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 8.531778425655977, "grad_norm": 0.15443430840969086, "learning_rate": 1e-06, "loss": 0.0366, "num_tokens": 595442198.0, "reward": 0.6718750596046448, "reward_std": 0.14969871938228607, "rewards/simpleverify_reward/mean": 0.671875, "rewards/simpleverify_reward/std": 0.46979284286499023, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3721.0, "completions/mean_length": 555.3772583007812, "completions/mean_terminated_length": 551.4212036132812, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 8.541107871720117, "grad_norm": 0.1517896056175232, "learning_rate": 1e-06, "loss": 0.0181, "num_tokens": 596034080.0, "reward": 0.7287946939468384, "reward_std": 0.11475691199302673, "rewards/simpleverify_reward/mean": 0.7287946343421936, "rewards/simpleverify_reward/std": 0.44483017921447754, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2190.0, "completions/max_terminated_length": 2190.0, "completions/mean_length": 561.34375, "completions/mean_terminated_length": 561.34375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 8.550437317784256, "grad_norm": 0.16738541424274445, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 596625620.0, "reward": 0.7321428656578064, "reward_std": 0.14511913061141968, "rewards/simpleverify_reward/mean": 0.7321428656578064, "rewards/simpleverify_reward/std": 0.4430900514125824, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3990.0, "completions/mean_length": 678.5614013671875, "completions/mean_terminated_length": 659.3838500976562, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 8.559766763848396, "grad_norm": 0.14112596213817596, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 597332627.0, "reward": 0.6015625, "reward_std": 0.13256819546222687, "rewards/simpleverify_reward/mean": 0.6015625, "rewards/simpleverify_reward/std": 0.48984986543655396, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4068.0, "completions/mean_length": 652.765625, "completions/mean_terminated_length": 606.02490234375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 8.569096209912537, "grad_norm": 0.1431453675031662, "learning_rate": 1e-06, "loss": 0.0204, "num_tokens": 598005913.0, "reward": 0.6484375, "reward_std": 0.1365058869123459, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2154.0, "completions/mean_length": 587.4676513671875, "completions/mean_terminated_length": 575.6808471679688, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 8.578425655976677, "grad_norm": 0.15039686858654022, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 598626260.0, "reward": 0.6506696939468384, "reward_std": 0.1321898251771927, "rewards/simpleverify_reward/mean": 0.6506696343421936, "rewards/simpleverify_reward/std": 0.47702476382255554, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 574.90625, "completions/mean_terminated_length": 559.1166381835938, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 8.587755102040816, "grad_norm": 0.20388759672641754, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 599236896.0, "reward": 0.7265625596046448, "reward_std": 0.1549587994813919, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2338.0, "completions/mean_length": 633.372802734375, "completions/mean_terminated_length": 613.941650390625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 8.597084548104956, "grad_norm": 0.14829044044017792, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 599887126.0, "reward": 0.668526828289032, "reward_std": 0.13842660188674927, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 601.5725708007812, "completions/mean_terminated_length": 578.0145874023438, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 8.606413994169095, "grad_norm": 0.1547084003686905, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 600508815.0, "reward": 0.6886160969734192, "reward_std": 0.12297996878623962, "rewards/simpleverify_reward/mean": 0.6886160969734192, "rewards/simpleverify_reward/std": 0.46331799030303955, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 609.1529541015625, "completions/mean_terminated_length": 589.5858764648438, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 8.615743440233237, "grad_norm": 0.15306076407432556, "learning_rate": 1e-06, "loss": 0.0265, "num_tokens": 601138728.0, "reward": 0.652901828289032, "reward_std": 0.1529282033443451, "rewards/simpleverify_reward/mean": 0.6529017686843872, "rewards/simpleverify_reward/std": 0.47631320357322693, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3372.0, "completions/mean_length": 602.8359375, "completions/mean_terminated_length": 595.021240234375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 8.625072886297376, "grad_norm": 0.15863120555877686, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 601775053.0, "reward": 0.6897321939468384, "reward_std": 0.1458340734243393, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 554.5670166015625, "completions/mean_terminated_length": 546.644287109375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 8.634402332361516, "grad_norm": 0.18039993941783905, "learning_rate": 1e-06, "loss": 0.0188, "num_tokens": 602364265.0, "reward": 0.6908482313156128, "reward_std": 0.1196063980460167, "rewards/simpleverify_reward/mean": 0.6908482313156128, "rewards/simpleverify_reward/std": 0.46240198612213135, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3733.0, "completions/mean_length": 612.1004638671875, "completions/mean_terminated_length": 596.4776000976562, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 8.643731778425655, "grad_norm": 0.14505524933338165, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 603004683.0, "reward": 0.6741071939468384, "reward_std": 0.12610459327697754, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.46896928548812866, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 582.9754638671875, "completions/mean_terminated_length": 563.2615356445312, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 8.653061224489797, "grad_norm": 0.17166797816753387, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 603614909.0, "reward": 0.7109375596046448, "reward_std": 0.15113016963005066, "rewards/simpleverify_reward/mean": 0.7109375, "rewards/simpleverify_reward/std": 0.45358020067214966, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3343.0, "completions/mean_length": 548.4765625, "completions/mean_terminated_length": 544.5128173828125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 8.662390670553936, "grad_norm": 0.1508386731147766, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 604197560.0, "reward": 0.6729910969734192, "reward_std": 0.10742567479610443, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.46938255429267883, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3402.0, "completions/mean_length": 638.4944458007812, "completions/mean_terminated_length": 607.3457641601562, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 8.671720116618076, "grad_norm": 0.15306919813156128, "learning_rate": 1e-06, "loss": 0.0276, "num_tokens": 604858755.0, "reward": 0.6205357313156128, "reward_std": 0.1465412974357605, "rewards/simpleverify_reward/mean": 0.6205357313156128, "rewards/simpleverify_reward/std": 0.4855247139930725, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 620.0870971679688, "completions/mean_terminated_length": 600.5814208984375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 8.681049562682215, "grad_norm": 0.1677822768688202, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 605503049.0, "reward": 0.668526828289032, "reward_std": 0.15466485917568207, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3318.0, "completions/mean_length": 647.0189819335938, "completions/mean_terminated_length": 596.2412109375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 8.690379008746355, "grad_norm": 0.15574854612350464, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 606169090.0, "reward": 0.7187500596046448, "reward_std": 0.13842841982841492, "rewards/simpleverify_reward/mean": 0.71875, "rewards/simpleverify_reward/std": 0.4498603343963623, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3704.0, "completions/mean_length": 675.6350708007812, "completions/mean_terminated_length": 660.297119140625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 8.699708454810496, "grad_norm": 0.15461820363998413, "learning_rate": 1e-06, "loss": 0.0238, "num_tokens": 606867899.0, "reward": 0.6629464626312256, "reward_std": 0.162291020154953, "rewards/simpleverify_reward/mean": 0.6629464030265808, "rewards/simpleverify_reward/std": 0.47296738624572754, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3297.0, "completions/mean_length": 616.9631958007812, "completions/mean_terminated_length": 597.4400024414062, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 8.709037900874636, "grad_norm": 0.16049326956272125, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 607514306.0, "reward": 0.6551339626312256, "reward_std": 0.14887428283691406, "rewards/simpleverify_reward/mean": 0.6551339030265808, "rewards/simpleverify_reward/std": 0.4755900800228119, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2771.0, "completions/mean_length": 663.6920166015625, "completions/mean_terminated_length": 644.4310302734375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 8.718367346938775, "grad_norm": 0.13569866120815277, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 608197662.0, "reward": 0.6707589626312256, "reward_std": 0.09953919798135757, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3028.0, "completions/mean_length": 647.7980346679688, "completions/mean_terminated_length": 640.0839233398438, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 8.727696793002915, "grad_norm": 0.14668728411197662, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 608864257.0, "reward": 0.6305803656578064, "reward_std": 0.1453794538974762, "rewards/simpleverify_reward/mean": 0.6305803656578064, "rewards/simpleverify_reward/std": 0.48291724920272827, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 590.7120971679688, "completions/mean_terminated_length": 567.0809326171875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 8.737026239067056, "grad_norm": 0.18246975541114807, "learning_rate": 1e-06, "loss": 0.0293, "num_tokens": 609472999.0, "reward": 0.6986607313156128, "reward_std": 0.16735777258872986, "rewards/simpleverify_reward/mean": 0.6986607313156128, "rewards/simpleverify_reward/std": 0.4590960144996643, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2139.0, "completions/mean_length": 593.0803833007812, "completions/mean_terminated_length": 569.4651489257812, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 8.746355685131196, "grad_norm": 0.1776711642742157, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 610092511.0, "reward": 0.668526828289032, "reward_std": 0.15590409934520721, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3970.0, "completions/mean_length": 601.3170166015625, "completions/mean_terminated_length": 589.5767211914062, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 8.755685131195335, "grad_norm": 0.16596218943595886, "learning_rate": 1e-06, "loss": 0.0287, "num_tokens": 610724507.0, "reward": 0.7064732313156128, "reward_std": 0.15149745345115662, "rewards/simpleverify_reward/mean": 0.7064732313156128, "rewards/simpleverify_reward/std": 0.4556320011615753, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2752.0, "completions/mean_length": 652.4397583007812, "completions/mean_terminated_length": 640.8712158203125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 8.765014577259475, "grad_norm": 0.15928713977336884, "learning_rate": 1e-06, "loss": 0.0121, "num_tokens": 611395765.0, "reward": 0.6651785969734192, "reward_std": 0.15811441838741302, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219157218933105, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 534.2455444335938, "completions/mean_terminated_length": 522.2799682617188, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 8.774344023323614, "grad_norm": 0.13102464377880096, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 611958425.0, "reward": 0.7712053656578064, "reward_std": 0.07507815212011337, "rewards/simpleverify_reward/mean": 0.7712053656578064, "rewards/simpleverify_reward/std": 0.42029133439064026, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3092.0, "completions/mean_length": 651.0736694335938, "completions/mean_terminated_length": 620.038330078125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 8.783673469387756, "grad_norm": 0.16787542402744293, "learning_rate": 1e-06, "loss": 0.0416, "num_tokens": 612635467.0, "reward": 0.6674107313156128, "reward_std": 0.14229939877986908, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2997.0, "completions/max_terminated_length": 2997.0, "completions/mean_length": 569.091552734375, "completions/mean_terminated_length": 569.091552734375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 8.793002915451895, "grad_norm": 0.17360210418701172, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 613224437.0, "reward": 0.7209821939468384, "reward_std": 0.13493356108665466, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4052.0, "completions/mean_length": 612.5971069335938, "completions/mean_terminated_length": 581.215087890625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 8.802332361516035, "grad_norm": 0.14062869548797607, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 613858420.0, "reward": 0.6674107313156128, "reward_std": 0.10990840196609497, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3132.0, "completions/mean_length": 592.3147583007812, "completions/mean_terminated_length": 584.4765014648438, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 8.811661807580174, "grad_norm": 0.13622620701789856, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 614479270.0, "reward": 0.7098214626312256, "reward_std": 0.12279380857944489, "rewards/simpleverify_reward/mean": 0.7098214030265808, "rewards/simpleverify_reward/std": 0.454098105430603, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3938.0, "completions/mean_length": 626.9486694335938, "completions/mean_terminated_length": 615.2945556640625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 8.820991253644316, "grad_norm": 0.1522369682788849, "learning_rate": 1e-06, "loss": 0.0092, "num_tokens": 615126608.0, "reward": 0.6819196939468384, "reward_std": 0.12628935277462006, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 4071.0, "completions/mean_length": 628.4933471679688, "completions/mean_terminated_length": 612.9439697265625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 8.830320699708455, "grad_norm": 0.14942966401576996, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 615773610.0, "reward": 0.6696428656578064, "reward_std": 0.12854455411434174, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.4706043601036072, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 573.984375, "completions/mean_terminated_length": 562.15234375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 8.839650145772595, "grad_norm": 0.17966817319393158, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 616371644.0, "reward": 0.6975446939468384, "reward_std": 0.13267697393894196, "rewards/simpleverify_reward/mean": 0.6975446343421936, "rewards/simpleverify_reward/std": 0.45957788825035095, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 615.1171875, "completions/mean_terminated_length": 615.1171875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 8.848979591836734, "grad_norm": 0.16204462945461273, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 617005309.0, "reward": 0.6462053656578064, "reward_std": 0.1386120617389679, "rewards/simpleverify_reward/mean": 0.6462053656578064, "rewards/simpleverify_reward/std": 0.478413462638855, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2828.0, "completions/mean_length": 661.8582763671875, "completions/mean_terminated_length": 646.4585571289062, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 8.858309037900874, "grad_norm": 0.13696114718914032, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 617688454.0, "reward": 0.6953125596046448, "reward_std": 0.13444432616233826, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2565.0, "completions/mean_length": 599.7310791015625, "completions/mean_terminated_length": 595.8245849609375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 8.867638483965015, "grad_norm": 0.18985693156719208, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 618312581.0, "reward": 0.6752232313156128, "reward_std": 0.13147485256195068, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 603.8192138671875, "completions/mean_terminated_length": 599.9172973632812, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 8.876967930029155, "grad_norm": 0.1510329246520996, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 618943659.0, "reward": 0.6696428656578064, "reward_std": 0.1278284639120102, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 612.4921875, "completions/mean_terminated_length": 596.87109375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 8.886297376093294, "grad_norm": 0.15741018950939178, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 619586492.0, "reward": 0.6540178656578064, "reward_std": 0.1282082498073578, "rewards/simpleverify_reward/mean": 0.6540178656578064, "rewards/simpleverify_reward/std": 0.4759531021118164, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3965.0, "completions/mean_length": 623.2232666015625, "completions/mean_terminated_length": 619.343017578125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 8.895626822157434, "grad_norm": 0.12903398275375366, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 620234948.0, "reward": 0.6908482313156128, "reward_std": 0.1090407744050026, "rewards/simpleverify_reward/mean": 0.6908482313156128, "rewards/simpleverify_reward/std": 0.46240198612213135, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 578.4609375, "completions/mean_terminated_length": 570.5917358398438, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 8.904956268221575, "grad_norm": 0.1666562557220459, "learning_rate": 1e-06, "loss": 0.0326, "num_tokens": 620844425.0, "reward": 0.6774553656578064, "reward_std": 0.12633171677589417, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3297.0, "completions/mean_length": 613.6517944335938, "completions/mean_terminated_length": 598.035888671875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 8.914285714285715, "grad_norm": 0.16606545448303223, "learning_rate": 1e-06, "loss": 0.0274, "num_tokens": 621485489.0, "reward": 0.7165178656578064, "reward_std": 0.1525951325893402, "rewards/simpleverify_reward/mean": 0.7165178656578064, "rewards/simpleverify_reward/std": 0.4509401023387909, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2678.0, "completions/mean_length": 614.8192138671875, "completions/mean_terminated_length": 587.4083251953125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 8.923615160349854, "grad_norm": 0.17255182564258575, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 622120087.0, "reward": 0.6964285969734192, "reward_std": 0.14771243929862976, "rewards/simpleverify_reward/mean": 0.6964285969734192, "rewards/simpleverify_reward/std": 0.4600566029548645, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3101.0, "completions/mean_length": 642.0390625, "completions/mean_terminated_length": 614.842529296875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 8.932944606413994, "grad_norm": 0.14393678307533264, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 622785586.0, "reward": 0.6595982313156128, "reward_std": 0.12500804662704468, "rewards/simpleverify_reward/mean": 0.6595982313156128, "rewards/simpleverify_reward/std": 0.4741089344024658, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 669.5201416015625, "completions/mean_terminated_length": 646.4202270507812, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 8.942274052478133, "grad_norm": 0.1427735537290573, "learning_rate": 1e-06, "loss": 0.0182, "num_tokens": 623477156.0, "reward": 0.6160714626312256, "reward_std": 0.13106118142604828, "rewards/simpleverify_reward/mean": 0.6160714030265808, "rewards/simpleverify_reward/std": 0.486612468957901, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2462.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 639.989990234375, "completions/mean_terminated_length": 639.989990234375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 8.951603498542275, "grad_norm": 0.15604734420776367, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 624145835.0, "reward": 0.6741071939468384, "reward_std": 0.13973930478096008, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2430.0, "completions/mean_length": 660.193115234375, "completions/mean_terminated_length": 617.4881591796875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 8.960932944606414, "grad_norm": 0.134669691324234, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 624828576.0, "reward": 0.6953125596046448, "reward_std": 0.11592086404561996, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3728.0, "completions/mean_length": 663.8192138671875, "completions/mean_terminated_length": 652.2889404296875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 8.970262390670554, "grad_norm": 0.16197822988033295, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 625520078.0, "reward": 0.7008928656578064, "reward_std": 0.14150775969028473, "rewards/simpleverify_reward/mean": 0.7008928656578064, "rewards/simpleverify_reward/std": 0.458122581243515, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3441.0, "completions/mean_length": 603.0067138671875, "completions/mean_terminated_length": 583.4052124023438, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 8.979591836734693, "grad_norm": 0.15663114190101624, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 626148932.0, "reward": 0.7098214626312256, "reward_std": 0.14447860419750214, "rewards/simpleverify_reward/mean": 0.7098214030265808, "rewards/simpleverify_reward/std": 0.454098105430603, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 622.5502319335938, "completions/mean_terminated_length": 603.0584106445312, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 8.988921282798835, "grad_norm": 0.18006478250026703, "learning_rate": 1e-06, "loss": 0.0366, "num_tokens": 626804009.0, "reward": 0.6573660969734192, "reward_std": 0.16874577105045319, "rewards/simpleverify_reward/mean": 0.6573660969734192, "rewards/simpleverify_reward/std": 0.47485533356666565, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0028409090909090606, "completions/max_length": 4096.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 640.0227661132812, "completions/mean_terminated_length": 630.1766357421875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 8.998250728862974, "grad_norm": 0.1681492179632187, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 627490464.0, "reward": 0.6350446939468384, "reward_std": 0.15601149201393127, "rewards/simpleverify_reward/mean": 0.6350446343421936, "rewards/simpleverify_reward/std": 0.481686532497406, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 658.779052734375, "completions/mean_terminated_length": 654.9385375976562, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 9.00932944606414, "grad_norm": 0.1535021960735321, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 628172938.0, "reward": 0.7020089626312256, "reward_std": 0.16450412571430206, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763099193573, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3604.0, "completions/mean_length": 588.1785888671875, "completions/mean_terminated_length": 572.448486328125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 9.018658892128279, "grad_norm": 0.15862442553043365, "learning_rate": 1e-06, "loss": 0.0121, "num_tokens": 628785930.0, "reward": 0.6986607313156128, "reward_std": 0.11963778734207153, "rewards/simpleverify_reward/mean": 0.6986607313156128, "rewards/simpleverify_reward/std": 0.4590960443019867, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2906.0, "completions/mean_length": 607.904052734375, "completions/mean_terminated_length": 592.2623901367188, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 9.02798833819242, "grad_norm": 0.1726294308900833, "learning_rate": 1e-06, "loss": 0.0286, "num_tokens": 629427284.0, "reward": 0.6819196939468384, "reward_std": 0.1563466638326645, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3471.0, "completions/mean_length": 656.6741333007812, "completions/mean_terminated_length": 641.2511596679688, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 9.03731778425656, "grad_norm": 0.15212643146514893, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 630106064.0, "reward": 0.6830357313156128, "reward_std": 0.151052787899971, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2427.0, "completions/mean_length": 632.4397583007812, "completions/mean_terminated_length": 628.56982421875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 9.0466472303207, "grad_norm": 0.1583513766527176, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 630778578.0, "reward": 0.6473214626312256, "reward_std": 0.13778649270534515, "rewards/simpleverify_reward/mean": 0.6473214030265808, "rewards/simpleverify_reward/std": 0.47807058691978455, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3335.0, "completions/mean_length": 606.146240234375, "completions/mean_terminated_length": 590.4966430664062, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 9.055976676384839, "grad_norm": 0.1665887087583542, "learning_rate": 1e-06, "loss": 0.0139, "num_tokens": 631406325.0, "reward": 0.7109375596046448, "reward_std": 0.1413230001926422, "rewards/simpleverify_reward/mean": 0.7109375, "rewards/simpleverify_reward/std": 0.45358020067214966, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 570.7455444335938, "completions/mean_terminated_length": 566.8067016601562, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 9.06530612244898, "grad_norm": 0.2378007471561432, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 632002241.0, "reward": 0.7031250596046448, "reward_std": 0.1389169543981552, "rewards/simpleverify_reward/mean": 0.703125, "rewards/simpleverify_reward/std": 0.4571361541748047, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3658.0, "completions/mean_length": 650.7109375, "completions/mean_terminated_length": 623.5827026367188, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 9.07463556851312, "grad_norm": 0.14771103858947754, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 632677686.0, "reward": 0.6517857313156128, "reward_std": 0.13136568665504456, "rewards/simpleverify_reward/mean": 0.6517857313156128, "rewards/simpleverify_reward/std": 0.47667041420936584, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 597.513427734375, "completions/mean_terminated_length": 585.7603759765625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 9.08396501457726, "grad_norm": 0.16702693700790405, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 633295978.0, "reward": 0.699776828289032, "reward_std": 0.14891524612903595, "rewards/simpleverify_reward/mean": 0.6997767686843872, "rewards/simpleverify_reward/std": 0.4586108922958374, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3656.0, "completions/mean_length": 632.4051513671875, "completions/mean_terminated_length": 620.7693481445312, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 9.093294460641399, "grad_norm": 0.15965713560581207, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 633950637.0, "reward": 0.6886160969734192, "reward_std": 0.1551099568605423, "rewards/simpleverify_reward/mean": 0.6886160969734192, "rewards/simpleverify_reward/std": 0.46331799030303955, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3084.0, "completions/max_terminated_length": 3084.0, "completions/mean_length": 657.7600708007812, "completions/mean_terminated_length": 657.7600708007812, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 9.102623906705539, "grad_norm": 0.15465950965881348, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 634620102.0, "reward": 0.6082589626312256, "reward_std": 0.14782078564167023, "rewards/simpleverify_reward/mean": 0.6082589030265808, "rewards/simpleverify_reward/std": 0.4884119927883148, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 621.8236694335938, "completions/mean_terminated_length": 617.94189453125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 9.11195335276968, "grad_norm": 0.2227480113506317, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 635282768.0, "reward": 0.6517857313156128, "reward_std": 0.18272632360458374, "rewards/simpleverify_reward/mean": 0.6517857313156128, "rewards/simpleverify_reward/std": 0.47667041420936584, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3558.0, "completions/mean_length": 584.5167846679688, "completions/mean_terminated_length": 580.59326171875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 9.12128279883382, "grad_norm": 0.151648610830307, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 635898671.0, "reward": 0.6484375, "reward_std": 0.12741731107234955, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 679.2221069335938, "completions/mean_terminated_length": 663.9002685546875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 9.130612244897959, "grad_norm": 0.15589477121829987, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 636600206.0, "reward": 0.6796875596046448, "reward_std": 0.14623749256134033, "rewards/simpleverify_reward/mean": 0.6796875, "rewards/simpleverify_reward/std": 0.4668572247028351, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3531.0, "completions/mean_length": 636.5279541015625, "completions/mean_terminated_length": 621.0145874023438, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 9.139941690962099, "grad_norm": 0.14637501537799835, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 637259303.0, "reward": 0.699776828289032, "reward_std": 0.12820753455162048, "rewards/simpleverify_reward/mean": 0.6997767686843872, "rewards/simpleverify_reward/std": 0.4586109220981598, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 613.8739013671875, "completions/mean_terminated_length": 606.0839233398438, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 9.14927113702624, "grad_norm": 0.15651018917560577, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 637904126.0, "reward": 0.7087053656578064, "reward_std": 0.13632294535636902, "rewards/simpleverify_reward/mean": 0.7087053656578064, "rewards/simpleverify_reward/std": 0.45461276173591614, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2676.0, "completions/mean_length": 629.8527221679688, "completions/mean_terminated_length": 606.4854125976562, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 9.15860058309038, "grad_norm": 0.17067843675613403, "learning_rate": 1e-06, "loss": 0.0331, "num_tokens": 638559674.0, "reward": 0.6651785969734192, "reward_std": 0.1720215231180191, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219157218933105, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2435.0, "completions/mean_length": 587.6897583007812, "completions/mean_terminated_length": 571.9573974609375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 9.167930029154519, "grad_norm": 0.18658314645290375, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 639183668.0, "reward": 0.6863839626312256, "reward_std": 0.16499380767345428, "rewards/simpleverify_reward/mean": 0.6863839030265808, "rewards/simpleverify_reward/std": 0.46422141790390015, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2802.0, "completions/mean_length": 592.8471069335938, "completions/mean_terminated_length": 585.0100708007812, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 9.177259475218658, "grad_norm": 0.16177615523338318, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 639794315.0, "reward": 0.6819196939468384, "reward_std": 0.12328417599201202, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3407.0, "completions/mean_length": 598.8850708007812, "completions/mean_terminated_length": 575.3090209960938, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 9.186588921282798, "grad_norm": 0.17306625843048096, "learning_rate": 1e-06, "loss": 0.0324, "num_tokens": 640416268.0, "reward": 0.7511160969734192, "reward_std": 0.14289601147174835, "rewards/simpleverify_reward/mean": 0.7511160969734192, "rewards/simpleverify_reward/std": 0.43260788917541504, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 650.640625, "completions/mean_terminated_length": 635.1906127929688, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 9.19591836734694, "grad_norm": 0.14807884395122528, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 641088994.0, "reward": 0.6696428656578064, "reward_std": 0.12253671884536743, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3542.0, "completions/mean_length": 563.0647583007812, "completions/mean_terminated_length": 559.1173095703125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 9.205247813411079, "grad_norm": 0.16315820813179016, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 641665172.0, "reward": 0.7500000596046448, "reward_std": 0.12422849237918854, "rewards/simpleverify_reward/mean": 0.75, "rewards/simpleverify_reward/std": 0.43325456976890564, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 572.5703125, "completions/mean_terminated_length": 564.6879272460938, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 9.214577259475218, "grad_norm": 0.1702897697687149, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 642263779.0, "reward": 0.7745535969734192, "reward_std": 0.14083515107631683, "rewards/simpleverify_reward/mean": 0.7745535969734192, "rewards/simpleverify_reward/std": 0.41810935735702515, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3000.0, "completions/mean_length": 688.3035888671875, "completions/mean_terminated_length": 665.330322265625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 9.223906705539358, "grad_norm": 0.1410369873046875, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 642979955.0, "reward": 0.6573660969734192, "reward_std": 0.11930078268051147, "rewards/simpleverify_reward/mean": 0.6573660969734192, "rewards/simpleverify_reward/std": 0.47485536336898804, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3278.0, "completions/mean_length": 668.0859375, "completions/mean_terminated_length": 660.417236328125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 9.2332361516035, "grad_norm": 0.17257247865200043, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 643678440.0, "reward": 0.6462053656578064, "reward_std": 0.17299792170524597, "rewards/simpleverify_reward/mean": 0.6462053656578064, "rewards/simpleverify_reward/std": 0.478413462638855, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 590.40625, "completions/mean_terminated_length": 570.7340087890625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 9.242565597667639, "grad_norm": 0.17454563081264496, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 644291076.0, "reward": 0.7165178656578064, "reward_std": 0.14873048663139343, "rewards/simpleverify_reward/mean": 0.7165178656578064, "rewards/simpleverify_reward/std": 0.4509401023387909, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 605.1585083007812, "completions/mean_terminated_length": 593.43115234375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 9.251895043731778, "grad_norm": 0.17182663083076477, "learning_rate": 1e-06, "loss": 0.0283, "num_tokens": 644930682.0, "reward": 0.6752232313156128, "reward_std": 0.17328962683677673, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2389.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 594.078125, "completions/mean_terminated_length": 594.078125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 9.261224489795918, "grad_norm": 0.15513907372951508, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 645565104.0, "reward": 0.6540178656578064, "reward_std": 0.12471703439950943, "rewards/simpleverify_reward/mean": 0.6540178656578064, "rewards/simpleverify_reward/std": 0.4759531021118164, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2111.0, "completions/mean_length": 642.8214721679688, "completions/mean_terminated_length": 615.6310424804688, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 9.270553935860057, "grad_norm": 0.15659648180007935, "learning_rate": 1e-06, "loss": 0.0202, "num_tokens": 646227408.0, "reward": 0.6439732313156128, "reward_std": 0.13831782341003418, "rewards/simpleverify_reward/mean": 0.6439732313156128, "rewards/simpleverify_reward/std": 0.47909072041511536, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3021.0, "completions/mean_length": 628.1127319335938, "completions/mean_terminated_length": 620.3546142578125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 9.279883381924199, "grad_norm": 0.14926894009113312, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 646889405.0, "reward": 0.6886160969734192, "reward_std": 0.1180230975151062, "rewards/simpleverify_reward/mean": 0.6886160969734192, "rewards/simpleverify_reward/std": 0.46331799030303955, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 633.6116333007812, "completions/mean_terminated_length": 606.3487548828125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 9.289212827988338, "grad_norm": 0.16376249492168427, "learning_rate": 1e-06, "loss": 0.022, "num_tokens": 647540297.0, "reward": 0.7377232313156128, "reward_std": 0.1557825356721878, "rewards/simpleverify_reward/mean": 0.7377232313156128, "rewards/simpleverify_reward/std": 0.4401180148124695, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3149.0, "completions/mean_length": 607.950927734375, "completions/mean_terminated_length": 596.2329711914062, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 9.298542274052478, "grad_norm": 0.1467505693435669, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 648177565.0, "reward": 0.676339328289032, "reward_std": 0.14075595140457153, "rewards/simpleverify_reward/mean": 0.6763392686843872, "rewards/simpleverify_reward/std": 0.4681335687637329, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 607.3549194335938, "completions/mean_terminated_length": 603.4569702148438, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 9.307871720116617, "grad_norm": 0.159001886844635, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 648814195.0, "reward": 0.715401828289032, "reward_std": 0.10615323483943939, "rewards/simpleverify_reward/mean": 0.7154017686843872, "rewards/simpleverify_reward/std": 0.4514748752117157, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 592.8303833007812, "completions/mean_terminated_length": 581.0615844726562, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 9.317201166180759, "grad_norm": 0.16901041567325592, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 649432699.0, "reward": 0.6830357313156128, "reward_std": 0.14222271740436554, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3440.0, "completions/mean_length": 616.4017944335938, "completions/mean_terminated_length": 589.00341796875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 9.326530612244898, "grad_norm": 0.14833012223243713, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 650062707.0, "reward": 0.7254464626312256, "reward_std": 0.13034509122371674, "rewards/simpleverify_reward/mean": 0.7254464030265808, "rewards/simpleverify_reward/std": 0.4465382993221283, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2095.0, "completions/mean_length": 552.1998291015625, "completions/mean_terminated_length": 548.240234375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 9.335860058309038, "grad_norm": 0.15463897585868835, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 650648854.0, "reward": 0.7187500596046448, "reward_std": 0.12373881042003632, "rewards/simpleverify_reward/mean": 0.71875, "rewards/simpleverify_reward/std": 0.4498603343963623, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3812.0, "completions/mean_length": 626.1105346679688, "completions/mean_terminated_length": 614.4535522460938, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 9.345189504373177, "grad_norm": 0.13659296929836273, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 651304569.0, "reward": 0.6819196939468384, "reward_std": 0.11362924426794052, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3582.0, "completions/mean_length": 620.7265625, "completions/mean_terminated_length": 601.2244873046875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 9.354518950437317, "grad_norm": 0.19725051522254944, "learning_rate": 1e-06, "loss": 0.0301, "num_tokens": 651954588.0, "reward": 0.684151828289032, "reward_std": 0.19181813299655914, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.4651124179363251, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3611.0, "completions/mean_length": 608.296875, "completions/mean_terminated_length": 580.8346557617188, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 9.363848396501458, "grad_norm": 0.1407598853111267, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 652582326.0, "reward": 0.6897321939468384, "reward_std": 0.10292234271764755, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2319.0, "completions/mean_length": 603.8638916015625, "completions/mean_terminated_length": 592.1321411132812, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 9.373177842565598, "grad_norm": 0.16631509363651276, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 653211076.0, "reward": 0.6651785969734192, "reward_std": 0.15405797958374023, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219160199165344, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 607.7734375, "completions/mean_terminated_length": 596.0548706054688, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 9.382507288629737, "grad_norm": 0.17286458611488342, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 653847097.0, "reward": 0.6953125596046448, "reward_std": 0.1652202308177948, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 624.8515625, "completions/mean_terminated_length": 620.97314453125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 9.391836734693877, "grad_norm": 0.1469295620918274, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 654491996.0, "reward": 0.6808035969734192, "reward_std": 0.11422586441040039, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.46642565727233887, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3217.0, "completions/mean_length": 649.8248291015625, "completions/mean_terminated_length": 638.2474975585938, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 9.401166180758018, "grad_norm": 0.15304288268089294, "learning_rate": 1e-06, "loss": -0.0037, "num_tokens": 655162847.0, "reward": 0.6517857313156128, "reward_std": 0.13951288163661957, "rewards/simpleverify_reward/mean": 0.6517857313156128, "rewards/simpleverify_reward/std": 0.47667041420936584, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3843.0, "completions/mean_length": 637.357177734375, "completions/mean_terminated_length": 598.320556640625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 9.410495626822158, "grad_norm": 0.2018808126449585, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 655822367.0, "reward": 0.6875000596046448, "reward_std": 0.1601438969373703, "rewards/simpleverify_reward/mean": 0.6875, "rewards/simpleverify_reward/std": 0.4637712836265564, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3148.0, "completions/mean_length": 634.1373291015625, "completions/mean_terminated_length": 618.61328125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 9.419825072886297, "grad_norm": 0.1807619035243988, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 656479138.0, "reward": 0.6439732313156128, "reward_std": 0.15634779632091522, "rewards/simpleverify_reward/mean": 0.6439732313156128, "rewards/simpleverify_reward/std": 0.47909072041511536, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2840.0, "completions/mean_length": 628.6261596679688, "completions/mean_terminated_length": 613.077392578125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 9.429154518950437, "grad_norm": 0.16395609080791473, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 657130219.0, "reward": 0.7020089626312256, "reward_std": 0.13008618354797363, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763099193573, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 621.734375, "completions/mean_terminated_length": 598.3123779296875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 9.438483965014576, "grad_norm": 0.15154406428337097, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 657784573.0, "reward": 0.684151828289032, "reward_std": 0.13230745494365692, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.46511244773864746, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 601.21875, "completions/mean_terminated_length": 597.31396484375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 9.447813411078718, "grad_norm": 0.17292281985282898, "learning_rate": 1e-06, "loss": 0.0223, "num_tokens": 658413753.0, "reward": 0.731026828289032, "reward_std": 0.14917738735675812, "rewards/simpleverify_reward/mean": 0.7310267686843872, "rewards/simpleverify_reward/std": 0.44367367029190063, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2731.0, "completions/mean_length": 599.6585083007812, "completions/mean_terminated_length": 576.087646484375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 9.457142857142857, "grad_norm": 0.14632754027843475, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 659040783.0, "reward": 0.7399553656578064, "reward_std": 0.12144014239311218, "rewards/simpleverify_reward/mean": 0.7399553656578064, "rewards/simpleverify_reward/std": 0.43890365958213806, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3663.0, "completions/mean_length": 616.4174194335938, "completions/mean_terminated_length": 600.8139038085938, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 9.466472303206997, "grad_norm": 0.17891216278076172, "learning_rate": 1e-06, "loss": 0.002, "num_tokens": 659699933.0, "reward": 0.6506696939468384, "reward_std": 0.1630052924156189, "rewards/simpleverify_reward/mean": 0.6506696343421936, "rewards/simpleverify_reward/std": 0.47702476382255554, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3917.0, "completions/mean_length": 706.8460083007812, "completions/mean_terminated_length": 687.8272094726562, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 9.475801749271136, "grad_norm": 0.1362696886062622, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 660416747.0, "reward": 0.6517857313156128, "reward_std": 0.13268017768859863, "rewards/simpleverify_reward/mean": 0.6517857313156128, "rewards/simpleverify_reward/std": 0.47667041420936584, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3381.0, "completions/mean_length": 699.0111694335938, "completions/mean_terminated_length": 672.2632446289062, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 9.485131195335278, "grad_norm": 0.1531856209039688, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 661133149.0, "reward": 0.5736607313156128, "reward_std": 0.17682725191116333, "rewards/simpleverify_reward/mean": 0.5736607313156128, "rewards/simpleverify_reward/std": 0.4948205351829529, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3810.0, "completions/mean_length": 584.9006958007812, "completions/mean_terminated_length": 577.0458374023438, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 9.494460641399417, "grad_norm": 0.15526026487350464, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 661737292.0, "reward": 0.7265625596046448, "reward_std": 0.12366396188735962, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 1999.0, "completions/mean_length": 610.6551513671875, "completions/mean_terminated_length": 598.9462890625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 9.503790087463557, "grad_norm": 0.1605006456375122, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 662366751.0, "reward": 0.7276785969734192, "reward_std": 0.14879831671714783, "rewards/simpleverify_reward/mean": 0.7276785969734192, "rewards/simpleverify_reward/std": 0.4454030692577362, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2432.0, "completions/mean_length": 610.3671875, "completions/mean_terminated_length": 598.6573486328125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 9.513119533527696, "grad_norm": 0.16288278996944427, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 663007032.0, "reward": 0.7433035969734192, "reward_std": 0.14143246412277222, "rewards/simpleverify_reward/mean": 0.7433035969734192, "rewards/simpleverify_reward/std": 0.43705442547798157, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3628.0, "completions/mean_length": 654.0502319335938, "completions/mean_terminated_length": 623.0416870117188, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 9.522448979591836, "grad_norm": 0.1608903408050537, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 663674261.0, "reward": 0.7343750596046448, "reward_std": 0.14631377160549164, "rewards/simpleverify_reward/mean": 0.734375, "rewards/simpleverify_reward/std": 0.44191211462020874, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 637.4810791015625, "completions/mean_terminated_length": 625.8623046875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 9.531778425655977, "grad_norm": 0.1706860512495041, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 664330292.0, "reward": 0.6752232313156128, "reward_std": 0.16713206470012665, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3023.0, "completions/max_terminated_length": 3023.0, "completions/mean_length": 603.5569458007812, "completions/mean_terminated_length": 603.5569458007812, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 9.541107871720117, "grad_norm": 0.17151015996932983, "learning_rate": 1e-06, "loss": 0.0142, "num_tokens": 664958871.0, "reward": 0.6930803656578064, "reward_std": 0.15357153117656708, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147337555885315, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2257.0, "completions/mean_length": 629.1875, "completions/mean_terminated_length": 613.6412963867188, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 9.550437317784256, "grad_norm": 0.15591515600681305, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 665610775.0, "reward": 0.7087053656578064, "reward_std": 0.13549810647964478, "rewards/simpleverify_reward/mean": 0.7087053656578064, "rewards/simpleverify_reward/std": 0.45461276173591614, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3243.0, "completions/mean_length": 702.2388916015625, "completions/mean_terminated_length": 690.837646484375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 9.559766763848396, "grad_norm": 0.16866551339626312, "learning_rate": 1e-06, "loss": 0.0239, "num_tokens": 666341125.0, "reward": 0.6183035969734192, "reward_std": 0.15049147605895996, "rewards/simpleverify_reward/mean": 0.6183035969734192, "rewards/simpleverify_reward/std": 0.4860740303993225, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3280.0, "completions/mean_length": 604.1585083007812, "completions/mean_terminated_length": 592.4277954101562, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 9.569096209912537, "grad_norm": 0.16647794842720032, "learning_rate": 1e-06, "loss": 0.012, "num_tokens": 666981819.0, "reward": 0.6964285969734192, "reward_std": 0.1141170859336853, "rewards/simpleverify_reward/mean": 0.6964285969734192, "rewards/simpleverify_reward/std": 0.4600565731525421, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 659.9542846679688, "completions/mean_terminated_length": 617.246337890625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 9.578425655976677, "grad_norm": 0.15608370304107666, "learning_rate": 1e-06, "loss": -0.0061, "num_tokens": 667656242.0, "reward": 0.7020089626312256, "reward_std": 0.12381480634212494, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763102173805237, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 570.65625, "completions/mean_terminated_length": 558.81298828125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 9.587755102040816, "grad_norm": 0.13728073239326477, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 668264830.0, "reward": 0.6830357313156128, "reward_std": 0.1008187010884285, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2714.0, "completions/mean_length": 592.9085083007812, "completions/mean_terminated_length": 588.994384765625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 9.597084548104956, "grad_norm": 0.12550999224185944, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 668882060.0, "reward": 0.715401828289032, "reward_std": 0.0845465213060379, "rewards/simpleverify_reward/mean": 0.7154017686843872, "rewards/simpleverify_reward/std": 0.4514748752117157, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2355.0, "completions/mean_length": 677.9777221679688, "completions/mean_terminated_length": 666.4949951171875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 9.606413994169095, "grad_norm": 0.13813650608062744, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 669583960.0, "reward": 0.6316964626312256, "reward_std": 0.1313222050666809, "rewards/simpleverify_reward/mean": 0.6316964030265808, "rewards/simpleverify_reward/std": 0.4826137125492096, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 572.8772583007812, "completions/mean_terminated_length": 564.9955444335938, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 9.615743440233237, "grad_norm": 0.18900643289089203, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 670192026.0, "reward": 0.6718750596046448, "reward_std": 0.13177725672721863, "rewards/simpleverify_reward/mean": 0.671875, "rewards/simpleverify_reward/std": 0.46979284286499023, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2519.0, "completions/mean_length": 555.8170166015625, "completions/mean_terminated_length": 547.8970947265625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 9.625072886297376, "grad_norm": 0.15554741024971008, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 670775390.0, "reward": 0.754464328289032, "reward_std": 0.11073324829339981, "rewards/simpleverify_reward/mean": 0.7544642686843872, "rewards/simpleverify_reward/std": 0.4306447505950928, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 618.6138916015625, "completions/mean_terminated_length": 610.83447265625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 9.634402332361516, "grad_norm": 0.1525791734457016, "learning_rate": 1e-06, "loss": 0.0155, "num_tokens": 671419332.0, "reward": 0.6953125596046448, "reward_std": 0.13399037718772888, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 631.6484375, "completions/mean_terminated_length": 620.0100708007812, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 9.643731778425655, "grad_norm": 0.15896205604076385, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 672079529.0, "reward": 0.6875000596046448, "reward_std": 0.15803773701190948, "rewards/simpleverify_reward/mean": 0.6875, "rewards/simpleverify_reward/std": 0.4637712836265564, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2479.0, "completions/max_terminated_length": 2479.0, "completions/mean_length": 591.435302734375, "completions/mean_terminated_length": 591.435302734375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 9.653061224489797, "grad_norm": 0.16745279729366302, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 672693399.0, "reward": 0.707589328289032, "reward_std": 0.12572622299194336, "rewards/simpleverify_reward/mean": 0.7075892686843872, "rewards/simpleverify_reward/std": 0.45512402057647705, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 639.755615234375, "completions/mean_terminated_length": 620.3602905273438, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 9.662390670553936, "grad_norm": 0.1489076018333435, "learning_rate": 1e-06, "loss": 0.0215, "num_tokens": 673356244.0, "reward": 0.6819196939468384, "reward_std": 0.12569203972816467, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3575.0, "completions/mean_length": 589.9866333007812, "completions/mean_terminated_length": 578.2083129882812, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 9.671720116618076, "grad_norm": 0.16098566353321075, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 673969984.0, "reward": 0.7254464626312256, "reward_std": 0.13756009936332703, "rewards/simpleverify_reward/mean": 0.7254464030265808, "rewards/simpleverify_reward/std": 0.4465382993221283, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 620.458740234375, "completions/mean_terminated_length": 604.8733520507812, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 9.681049562682215, "grad_norm": 0.17847058176994324, "learning_rate": 1e-06, "loss": 0.0205, "num_tokens": 674614907.0, "reward": 0.6897321939468384, "reward_std": 0.15026254951953888, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3289.0, "completions/mean_length": 634.3671875, "completions/mean_terminated_length": 618.8441772460938, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 9.690379008746355, "grad_norm": 0.14025600254535675, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 675262572.0, "reward": 0.7198660969734192, "reward_std": 0.1180986762046814, "rewards/simpleverify_reward/mean": 0.7198660969734192, "rewards/simpleverify_reward/std": 0.44931527972221375, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4062.0, "completions/mean_length": 652.9230346679688, "completions/mean_terminated_length": 633.6016235351562, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 9.699708454810496, "grad_norm": 0.17019487917423248, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 675931255.0, "reward": 0.6517857313156128, "reward_std": 0.15154021978378296, "rewards/simpleverify_reward/mean": 0.6517857313156128, "rewards/simpleverify_reward/std": 0.47667041420936584, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 583.9252319335938, "completions/mean_terminated_length": 576.0682373046875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 9.709037900874636, "grad_norm": 0.1660180240869522, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 676543764.0, "reward": 0.6886160969734192, "reward_std": 0.13008618354797363, "rewards/simpleverify_reward/mean": 0.6886160969734192, "rewards/simpleverify_reward/std": 0.46331802010536194, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2185.0, "completions/mean_length": 644.0792846679688, "completions/mean_terminated_length": 616.8988037109375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 9.718367346938775, "grad_norm": 0.13898447155952454, "learning_rate": 1e-06, "loss": 0.0259, "num_tokens": 677206019.0, "reward": 0.6540178656578064, "reward_std": 0.13365477323532104, "rewards/simpleverify_reward/mean": 0.6540178656578064, "rewards/simpleverify_reward/std": 0.4759531021118164, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 4023.0, "completions/mean_length": 574.2142944335938, "completions/mean_terminated_length": 566.3355712890625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 9.727696793002915, "grad_norm": 0.16513501107692719, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 677811075.0, "reward": 0.7209821939468384, "reward_std": 0.12903422117233276, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 605.8694458007812, "completions/mean_terminated_length": 578.3881225585938, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 9.737026239067056, "grad_norm": 0.15164721012115479, "learning_rate": 1e-06, "loss": 0.0182, "num_tokens": 678436318.0, "reward": 0.7522321939468384, "reward_std": 0.10179439932107925, "rewards/simpleverify_reward/mean": 0.7522321343421936, "rewards/simpleverify_reward/std": 0.4319573938846588, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3190.0, "completions/mean_length": 595.114990234375, "completions/mean_terminated_length": 583.3538818359375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 9.746355685131196, "grad_norm": 0.17198269069194794, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 679056469.0, "reward": 0.6662946939468384, "reward_std": 0.1407923698425293, "rewards/simpleverify_reward/mean": 0.6662946343421936, "rewards/simpleverify_reward/std": 0.47179925441741943, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3476.0, "completions/mean_length": 622.7410888671875, "completions/mean_terminated_length": 611.0728149414062, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 9.755685131195335, "grad_norm": 0.16907845437526703, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 679700173.0, "reward": 0.7020089626312256, "reward_std": 0.14260180294513702, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763099193573, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2302.0, "completions/mean_length": 582.9319458007812, "completions/mean_terminated_length": 563.2177734375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 9.765014577259475, "grad_norm": 0.1463419795036316, "learning_rate": 1e-06, "loss": 0.0239, "num_tokens": 680297448.0, "reward": 0.7209821939468384, "reward_std": 0.11460676044225693, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3807.0, "completions/mean_length": 719.6283569335938, "completions/mean_terminated_length": 693.0427856445312, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 9.774344023323614, "grad_norm": 0.1890610009431839, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 681036531.0, "reward": 0.660714328289032, "reward_std": 0.14966411888599396, "rewards/simpleverify_reward/mean": 0.6607142686843872, "rewards/simpleverify_reward/std": 0.4737313687801361, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3173.0, "completions/mean_length": 659.3560791015625, "completions/mean_terminated_length": 651.6677856445312, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 9.783673469387756, "grad_norm": 0.1500820815563202, "learning_rate": 1e-06, "loss": 0.0098, "num_tokens": 681724722.0, "reward": 0.621651828289032, "reward_std": 0.12715768814086914, "rewards/simpleverify_reward/mean": 0.6216517686843872, "rewards/simpleverify_reward/std": 0.4852459728717804, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 641.8527221679688, "completions/mean_terminated_length": 626.36328125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 9.793002915451895, "grad_norm": 0.14254878461360931, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 682393246.0, "reward": 0.7098214626312256, "reward_std": 0.11840105801820755, "rewards/simpleverify_reward/mean": 0.7098214030265808, "rewards/simpleverify_reward/std": 0.454098105430603, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3408.0, "completions/mean_length": 628.3292846679688, "completions/mean_terminated_length": 616.6797485351562, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 9.802332361516035, "grad_norm": 0.17048749327659607, "learning_rate": 1e-06, "loss": 0.0321, "num_tokens": 683044069.0, "reward": 0.6495535969734192, "reward_std": 0.15015378594398499, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.477376252412796, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3077.0, "completions/mean_length": 602.1473388671875, "completions/mean_terminated_length": 594.3311157226562, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 9.811661807580174, "grad_norm": 0.16566908359527588, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 683671433.0, "reward": 0.7712053656578064, "reward_std": 0.13290590047836304, "rewards/simpleverify_reward/mean": 0.7712053656578064, "rewards/simpleverify_reward/std": 0.42029133439064026, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3388.0, "completions/mean_length": 652.271240234375, "completions/mean_terminated_length": 640.7021484375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 9.820991253644316, "grad_norm": 0.17695604264736176, "learning_rate": 1e-06, "loss": 0.0459, "num_tokens": 684336436.0, "reward": 0.707589328289032, "reward_std": 0.1741262972354889, "rewards/simpleverify_reward/mean": 0.7075892686843872, "rewards/simpleverify_reward/std": 0.45512402057647705, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3812.0, "completions/max_terminated_length": 3812.0, "completions/mean_length": 670.8058471679688, "completions/mean_terminated_length": 670.8058471679688, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 9.830320699708455, "grad_norm": 0.13639985024929047, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 685031726.0, "reward": 0.6540178656578064, "reward_std": 0.12253417819738388, "rewards/simpleverify_reward/mean": 0.6540178656578064, "rewards/simpleverify_reward/std": 0.4759531021118164, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 648.849365234375, "completions/mean_terminated_length": 633.3912963867188, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 9.839650145772595, "grad_norm": 0.1845235675573349, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 685695935.0, "reward": 0.676339328289032, "reward_std": 0.16698797047138214, "rewards/simpleverify_reward/mean": 0.6763392686843872, "rewards/simpleverify_reward/std": 0.4681335985660553, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3764.0, "completions/mean_length": 653.3092041015625, "completions/mean_terminated_length": 622.2939453125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 9.848979591836734, "grad_norm": 0.13669440150260925, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 686371356.0, "reward": 0.6238839626312256, "reward_std": 0.09878852218389511, "rewards/simpleverify_reward/mean": 0.6238839030265808, "rewards/simpleverify_reward/std": 0.4846802353858948, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2091.0, "completions/mean_length": 625.989990234375, "completions/mean_terminated_length": 618.22705078125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 9.858309037900874, "grad_norm": 0.13652396202087402, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 687034395.0, "reward": 0.6696428656578064, "reward_std": 0.11081133037805557, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3717.0, "completions/mean_length": 666.0736694335938, "completions/mean_terminated_length": 650.69287109375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 9.867638483965015, "grad_norm": 0.14149810373783112, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 687725405.0, "reward": 0.6462053656578064, "reward_std": 0.1253136694431305, "rewards/simpleverify_reward/mean": 0.6462053656578064, "rewards/simpleverify_reward/std": 0.478413462638855, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 656.7980346679688, "completions/mean_terminated_length": 617.9808349609375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 9.876967930029155, "grad_norm": 0.13617300987243652, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 688401000.0, "reward": 0.684151828289032, "reward_std": 0.10821591317653656, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.4651124179363251, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2529.0, "completions/mean_length": 644.724365234375, "completions/mean_terminated_length": 621.4573364257812, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 9.886297376093294, "grad_norm": 0.16267941892147064, "learning_rate": 1e-06, "loss": 0.0164, "num_tokens": 689072521.0, "reward": 0.6774553656578064, "reward_std": 0.1708482801914215, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 643.2902221679688, "completions/mean_terminated_length": 631.6909790039062, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 9.895626822157434, "grad_norm": 0.1690305918455124, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 689735421.0, "reward": 0.645089328289032, "reward_std": 0.15601077675819397, "rewards/simpleverify_reward/mean": 0.6450892686843872, "rewards/simpleverify_reward/std": 0.4787535071372986, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3945.0, "completions/mean_length": 599.7678833007812, "completions/mean_terminated_length": 588.0223999023438, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 9.904956268221575, "grad_norm": 0.14511066675186157, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 690358997.0, "reward": 0.6897321939468384, "reward_std": 0.10190316289663315, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.462861567735672, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2527.0, "completions/max_terminated_length": 2527.0, "completions/mean_length": 529.9152221679688, "completions/mean_terminated_length": 529.9152221679688, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 9.914285714285715, "grad_norm": 0.17130878567695618, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 690914425.0, "reward": 0.7377232313156128, "reward_std": 0.13177795708179474, "rewards/simpleverify_reward/mean": 0.7377232313156128, "rewards/simpleverify_reward/std": 0.4401180148124695, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2669.0, "completions/mean_length": 622.341552734375, "completions/mean_terminated_length": 594.9899291992188, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 9.923615160349854, "grad_norm": 0.15637469291687012, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 691560899.0, "reward": 0.7020089626312256, "reward_std": 0.14214786887168884, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763102173805237, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2444.0, "completions/mean_length": 582.7254638671875, "completions/mean_terminated_length": 574.8657836914062, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 9.932944606413994, "grad_norm": 0.16943387687206268, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 692178621.0, "reward": 0.6908482313156128, "reward_std": 0.13696233928203583, "rewards/simpleverify_reward/mean": 0.6908482313156128, "rewards/simpleverify_reward/std": 0.46240198612213135, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2615.0, "completions/mean_length": 691.4766235351562, "completions/mean_terminated_length": 680.0392456054688, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 9.942274052478133, "grad_norm": 0.1340373158454895, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 692878024.0, "reward": 0.7042410969734192, "reward_std": 0.1180986762046814, "rewards/simpleverify_reward/mean": 0.7042410969734192, "rewards/simpleverify_reward/std": 0.45663803815841675, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3365.0, "completions/mean_length": 656.7745971679688, "completions/mean_terminated_length": 645.2206420898438, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 9.951603498542275, "grad_norm": 0.14161759614944458, "learning_rate": 1e-06, "loss": 0.022, "num_tokens": 693558398.0, "reward": 0.7087053656578064, "reward_std": 0.13031259179115295, "rewards/simpleverify_reward/mean": 0.7087053656578064, "rewards/simpleverify_reward/std": 0.45461276173591614, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3394.0, "completions/mean_length": 632.8125, "completions/mean_terminated_length": 621.1781005859375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 9.960932944606414, "grad_norm": 0.15689370036125183, "learning_rate": 1e-06, "loss": 0.0287, "num_tokens": 694216574.0, "reward": 0.6584821939468384, "reward_std": 0.13771232962608337, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2243.0, "completions/mean_length": 640.982177734375, "completions/mean_terminated_length": 621.59375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 9.970262390670554, "grad_norm": 0.15540839731693268, "learning_rate": 1e-06, "loss": 0.0485, "num_tokens": 694883678.0, "reward": 0.7008928656578064, "reward_std": 0.15785479545593262, "rewards/simpleverify_reward/mean": 0.7008928656578064, "rewards/simpleverify_reward/std": 0.458122581243515, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3270.0, "completions/mean_length": 612.2957763671875, "completions/mean_terminated_length": 596.6737670898438, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 9.979591836734693, "grad_norm": 0.15626201033592224, "learning_rate": 1e-06, "loss": 0.0045, "num_tokens": 695511239.0, "reward": 0.7566964626312256, "reward_std": 0.11753525584936142, "rewards/simpleverify_reward/mean": 0.7566964030265808, "rewards/simpleverify_reward/std": 0.42931652069091797, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3508.0, "completions/mean_length": 641.700927734375, "completions/mean_terminated_length": 630.0963134765625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 9.988921282798835, "grad_norm": 0.1672876924276352, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 696190291.0, "reward": 0.6305803656578064, "reward_std": 0.13121412694454193, "rewards/simpleverify_reward/mean": 0.6305803656578064, "rewards/simpleverify_reward/std": 0.4829172194004059, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011363636363636354, "completions/max_length": 4096.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 711.9318237304688, "completions/mean_terminated_length": 673.0344848632812, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 9.998250728862974, "grad_norm": 0.18073733150959015, "learning_rate": 1e-06, "loss": 0.02, "num_tokens": 696884359.0, "reward": 0.7120535969734192, "reward_std": 0.15995843708515167, "rewards/simpleverify_reward/mean": 0.7120535969734192, "rewards/simpleverify_reward/std": 0.4530589282512665, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3429.0, "completions/mean_length": 645.9710083007812, "completions/mean_terminated_length": 622.7123413085938, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 10.00932944606414, "grad_norm": 0.16506943106651306, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 697555653.0, "reward": 0.65625, "reward_std": 0.14466658234596252, "rewards/simpleverify_reward/mean": 0.65625, "rewards/simpleverify_reward/std": 0.4752241373062134, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 648.53125, "completions/mean_terminated_length": 636.9496459960938, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 10.018658892128279, "grad_norm": 0.14960868656635284, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 698230145.0, "reward": 0.6640625, "reward_std": 0.14736545085906982, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 644.765625, "completions/mean_terminated_length": 633.1713256835938, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 10.02798833819242, "grad_norm": 0.1376456469297409, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 698898783.0, "reward": 0.6941964626312256, "reward_std": 0.12299064546823502, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.46100425720214844, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3344.0, "completions/mean_length": 652.9866333007812, "completions/mean_terminated_length": 633.6655883789062, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 10.03731778425656, "grad_norm": 0.14650900661945343, "learning_rate": 1e-06, "loss": 0.0331, "num_tokens": 699569811.0, "reward": 0.7020089626312256, "reward_std": 0.1281333714723587, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763099193573, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3218.0, "completions/mean_length": 673.4877319335938, "completions/mean_terminated_length": 661.9899291992188, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 10.0466472303207, "grad_norm": 0.1823563426733017, "learning_rate": 1e-06, "loss": 0.0299, "num_tokens": 700268688.0, "reward": 0.6752232313156128, "reward_std": 0.1727689951658249, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3566.0, "completions/mean_length": 603.7957763671875, "completions/mean_terminated_length": 599.8938598632812, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 10.055976676384839, "grad_norm": 0.1735924929380417, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 700886265.0, "reward": 0.7734375596046448, "reward_std": 0.11133874207735062, "rewards/simpleverify_reward/mean": 0.7734375, "rewards/simpleverify_reward/std": 0.4188409447669983, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3769.0, "completions/mean_length": 649.0067138671875, "completions/mean_terminated_length": 633.5493774414062, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 10.06530612244898, "grad_norm": 0.1469613015651703, "learning_rate": 1e-06, "loss": 0.0268, "num_tokens": 701560119.0, "reward": 0.7187500596046448, "reward_std": 0.11268605291843414, "rewards/simpleverify_reward/mean": 0.71875, "rewards/simpleverify_reward/std": 0.4498603343963623, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3975.0, "completions/mean_length": 662.5569458007812, "completions/mean_terminated_length": 654.8758544921875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 10.07463556851312, "grad_norm": 0.1644328236579895, "learning_rate": 1e-06, "loss": 0.0322, "num_tokens": 702241250.0, "reward": 0.7109375596046448, "reward_std": 0.158342644572258, "rewards/simpleverify_reward/mean": 0.7109375, "rewards/simpleverify_reward/std": 0.45358020067214966, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 686.2545166015625, "completions/mean_terminated_length": 647.769775390625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 10.08396501457726, "grad_norm": 0.1560114473104477, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 702958918.0, "reward": 0.6551339626312256, "reward_std": 0.1518348753452301, "rewards/simpleverify_reward/mean": 0.6551339030265808, "rewards/simpleverify_reward/std": 0.4755900800228119, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3821.0, "completions/mean_length": 701.4832763671875, "completions/mean_terminated_length": 674.7548217773438, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 10.093294460641399, "grad_norm": 0.13459162414073944, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 703689319.0, "reward": 0.7109375596046448, "reward_std": 0.11745787411928177, "rewards/simpleverify_reward/mean": 0.7109375, "rewards/simpleverify_reward/std": 0.45358020067214966, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2214.0, "completions/mean_length": 616.4364013671875, "completions/mean_terminated_length": 612.548583984375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 10.102623906705539, "grad_norm": 0.16519084572792053, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 704334310.0, "reward": 0.7098214626312256, "reward_std": 0.13147304952144623, "rewards/simpleverify_reward/mean": 0.7098214030265808, "rewards/simpleverify_reward/std": 0.454098105430603, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 618.5859375, "completions/mean_terminated_length": 610.8065185546875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 10.11195335276968, "grad_norm": 0.13807250559329987, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 704983835.0, "reward": 0.660714328289032, "reward_std": 0.11655844002962112, "rewards/simpleverify_reward/mean": 0.6607142686843872, "rewards/simpleverify_reward/std": 0.4737313687801361, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3898.0, "completions/mean_length": 623.7701416015625, "completions/mean_terminated_length": 619.8905029296875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 10.12128279883382, "grad_norm": 0.1637779176235199, "learning_rate": 1e-06, "loss": 0.0223, "num_tokens": 705642749.0, "reward": 0.6975446939468384, "reward_std": 0.12433655560016632, "rewards/simpleverify_reward/mean": 0.6975446343421936, "rewards/simpleverify_reward/std": 0.45957791805267334, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3656.0, "completions/mean_length": 681.5859375, "completions/mean_terminated_length": 654.7008056640625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 10.130612244897959, "grad_norm": 0.16471588611602783, "learning_rate": 1e-06, "loss": 0.0274, "num_tokens": 706347890.0, "reward": 0.621651828289032, "reward_std": 0.16868045926094055, "rewards/simpleverify_reward/mean": 0.6216517686843872, "rewards/simpleverify_reward/std": 0.4852459728717804, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3709.0, "completions/mean_length": 682.2689819335938, "completions/mean_terminated_length": 655.3892211914062, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 10.139941690962099, "grad_norm": 0.1286601424217224, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 707042851.0, "reward": 0.6897321939468384, "reward_std": 0.10171771794557571, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2608.0, "completions/mean_length": 570.1428833007812, "completions/mean_terminated_length": 562.2550659179688, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 10.14927113702624, "grad_norm": 0.16534437239170074, "learning_rate": 1e-06, "loss": 0.0291, "num_tokens": 707637251.0, "reward": 0.7868303656578064, "reward_std": 0.12166768312454224, "rewards/simpleverify_reward/mean": 0.7868303656578064, "rewards/simpleverify_reward/std": 0.4097752273082733, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4056.0, "completions/mean_length": 615.8348388671875, "completions/mean_terminated_length": 596.3052978515625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 10.15860058309038, "grad_norm": 0.14394022524356842, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 708284127.0, "reward": 0.7165178656578064, "reward_std": 0.10517683625221252, "rewards/simpleverify_reward/mean": 0.7165178656578064, "rewards/simpleverify_reward/std": 0.4509401023387909, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 669.5792846679688, "completions/mean_terminated_length": 650.351318359375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 10.167930029154519, "grad_norm": 0.16376285254955292, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 708969094.0, "reward": 0.7209821939468384, "reward_std": 0.1554890275001526, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2519.0, "completions/mean_length": 683.2467041015625, "completions/mean_terminated_length": 667.94287109375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 10.177259475218658, "grad_norm": 0.13614824414253235, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 709672843.0, "reward": 0.6584821939468384, "reward_std": 0.11486637592315674, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3717.0, "completions/mean_length": 576.8795166015625, "completions/mean_terminated_length": 572.9474487304688, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 10.186588921282798, "grad_norm": 0.18253451585769653, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 710275151.0, "reward": 0.7734375596046448, "reward_std": 0.13868872821331024, "rewards/simpleverify_reward/mean": 0.7734375, "rewards/simpleverify_reward/std": 0.4188409447669983, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 595.5011596679688, "completions/mean_terminated_length": 559.9830322265625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 10.19591836734694, "grad_norm": 0.17159995436668396, "learning_rate": 1e-06, "loss": 0.0296, "num_tokens": 710892872.0, "reward": 0.7678571939468384, "reward_std": 0.14184294641017914, "rewards/simpleverify_reward/mean": 0.7678571343421936, "rewards/simpleverify_reward/std": 0.422435462474823, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3130.0, "completions/mean_length": 697.583740234375, "completions/mean_terminated_length": 674.6730346679688, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 10.205247813411079, "grad_norm": 0.1491999626159668, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 711609803.0, "reward": 0.6316964626312256, "reward_std": 0.15071649849414825, "rewards/simpleverify_reward/mean": 0.6316964030265808, "rewards/simpleverify_reward/std": 0.4826137125492096, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 654.7589721679688, "completions/mean_terminated_length": 635.4478149414062, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 10.214577259475218, "grad_norm": 0.157756045460701, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 712279659.0, "reward": 0.6651785969734192, "reward_std": 0.14301295578479767, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219157218933105, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3704.0, "completions/mean_length": 687.1406860351562, "completions/mean_terminated_length": 652.5523681640625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 10.223906705539358, "grad_norm": 0.17656783759593964, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 712991921.0, "reward": 0.6395089626312256, "reward_std": 0.16206279397010803, "rewards/simpleverify_reward/mean": 0.6395089030265808, "rewards/simpleverify_reward/std": 0.4804111421108246, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3355.0, "completions/max_terminated_length": 3355.0, "completions/mean_length": 630.5245971679688, "completions/mean_terminated_length": 630.5245971679688, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 10.2332361516035, "grad_norm": 0.1623477339744568, "learning_rate": 1e-06, "loss": 0.0233, "num_tokens": 713640847.0, "reward": 0.6741071939468384, "reward_std": 0.15097863972187042, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3428.0, "completions/mean_length": 685.0938110351562, "completions/mean_terminated_length": 665.952880859375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 10.242565597667639, "grad_norm": 0.16317027807235718, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 714345451.0, "reward": 0.6875000596046448, "reward_std": 0.16255316138267517, "rewards/simpleverify_reward/mean": 0.6875, "rewards/simpleverify_reward/std": 0.4637712836265564, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3537.0, "completions/mean_length": 589.5067138671875, "completions/mean_terminated_length": 585.5888061523438, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 10.251895043731778, "grad_norm": 0.14250989258289337, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 714960129.0, "reward": 0.7477678656578064, "reward_std": 0.09468678385019302, "rewards/simpleverify_reward/mean": 0.7477678656578064, "rewards/simpleverify_reward/std": 0.4345363676548004, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3588.0, "completions/mean_length": 701.1105346679688, "completions/mean_terminated_length": 651.1290893554688, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 10.261224489795918, "grad_norm": 0.1558908224105835, "learning_rate": 1e-06, "loss": 0.0247, "num_tokens": 715674492.0, "reward": 0.6975446939468384, "reward_std": 0.1497400850057602, "rewards/simpleverify_reward/mean": 0.6975446343421936, "rewards/simpleverify_reward/std": 0.45957788825035095, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 609.0848388671875, "completions/mean_terminated_length": 593.448486328125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 10.270553935860057, "grad_norm": 0.1572684645652771, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 716313552.0, "reward": 0.7142857313156128, "reward_std": 0.1347392499446869, "rewards/simpleverify_reward/mean": 0.7142857313156128, "rewards/simpleverify_reward/std": 0.4520062506198883, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3542.0, "completions/mean_length": 623.6451416015625, "completions/mean_terminated_length": 608.0740356445312, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 10.279883381924199, "grad_norm": 0.150367870926857, "learning_rate": 1e-06, "loss": 0.0099, "num_tokens": 716973794.0, "reward": 0.6383928656578064, "reward_std": 0.11866319924592972, "rewards/simpleverify_reward/mean": 0.6383928656578064, "rewards/simpleverify_reward/std": 0.4807341992855072, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2273.0, "completions/mean_length": 597.4866333007812, "completions/mean_terminated_length": 593.57763671875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 10.289212827988338, "grad_norm": 0.15089178085327148, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 717599078.0, "reward": 0.6707589626312256, "reward_std": 0.12576758861541748, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2976.0, "completions/mean_length": 643.0390625, "completions/mean_terminated_length": 631.43896484375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 10.298542274052478, "grad_norm": 0.17976650595664978, "learning_rate": 1e-06, "loss": 0.0174, "num_tokens": 718267889.0, "reward": 0.6852678656578064, "reward_std": 0.17318014800548553, "rewards/simpleverify_reward/mean": 0.6852678656578064, "rewards/simpleverify_reward/std": 0.4646684527397156, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3764.0, "completions/mean_length": 650.1015625, "completions/mean_terminated_length": 638.5252075195312, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 10.307871720116617, "grad_norm": 0.17093563079833984, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 718943396.0, "reward": 0.6629464626312256, "reward_std": 0.14327509701251984, "rewards/simpleverify_reward/mean": 0.6629464030265808, "rewards/simpleverify_reward/std": 0.47296738624572754, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3332.0, "completions/mean_length": 693.0982666015625, "completions/mean_terminated_length": 689.2960815429688, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 10.317201166180759, "grad_norm": 0.13757453858852386, "learning_rate": 1e-06, "loss": 0.0189, "num_tokens": 719648756.0, "reward": 0.6495535969734192, "reward_std": 0.11137405782938004, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.47737622261047363, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 656.4408569335938, "completions/mean_terminated_length": 648.74609375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 10.326530612244898, "grad_norm": 0.15484566986560822, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 720321615.0, "reward": 0.723214328289032, "reward_std": 0.15075109899044037, "rewards/simpleverify_reward/mean": 0.7232142686843872, "rewards/simpleverify_reward/std": 0.44765952229499817, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3035.0, "completions/mean_length": 674.0892944335938, "completions/mean_terminated_length": 647.1451416015625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 10.335860058309038, "grad_norm": 0.13714765012264252, "learning_rate": 1e-06, "loss": 0.0278, "num_tokens": 721010375.0, "reward": 0.6718750596046448, "reward_std": 0.1263321340084076, "rewards/simpleverify_reward/mean": 0.671875, "rewards/simpleverify_reward/std": 0.46979284286499023, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3077.0, "completions/mean_length": 569.154052734375, "completions/mean_terminated_length": 565.21337890625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 10.345189504373177, "grad_norm": 0.16163308918476105, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 721606921.0, "reward": 0.7209821939468384, "reward_std": 0.11028818041086197, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3197.0, "completions/mean_length": 664.3951416015625, "completions/mean_terminated_length": 625.6636962890625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 10.354518950437317, "grad_norm": 0.1432984620332718, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 722286131.0, "reward": 0.715401828289032, "reward_std": 0.1284685730934143, "rewards/simpleverify_reward/mean": 0.7154017686843872, "rewards/simpleverify_reward/std": 0.4514748752117157, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 579.4498291015625, "completions/mean_terminated_length": 567.6360473632812, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 10.363848396501458, "grad_norm": 0.13722288608551025, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 722892710.0, "reward": 0.7533482313156128, "reward_std": 0.08713982999324799, "rewards/simpleverify_reward/mean": 0.7533482313156128, "rewards/simpleverify_reward/std": 0.4313030242919922, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3747.0, "completions/mean_length": 675.7734375, "completions/mean_terminated_length": 637.1704711914062, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 10.373177842565598, "grad_norm": 0.16674593091011047, "learning_rate": 1e-06, "loss": 0.0302, "num_tokens": 723583299.0, "reward": 0.6662946939468384, "reward_std": 0.14327579736709595, "rewards/simpleverify_reward/mean": 0.6662946343421936, "rewards/simpleverify_reward/std": 0.47179925441741943, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3571.0, "completions/mean_length": 668.591552734375, "completions/mean_terminated_length": 641.6040649414062, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 10.382507288629737, "grad_norm": 0.19711953401565552, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 724256109.0, "reward": 0.6941964626312256, "reward_std": 0.1582231968641281, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.4610042870044708, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3961.0, "completions/mean_length": 634.5234375, "completions/mean_terminated_length": 615.0988159179688, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 10.391836734693877, "grad_norm": 0.15869268774986267, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 724910162.0, "reward": 0.746651828289032, "reward_std": 0.11821742355823517, "rewards/simpleverify_reward/mean": 0.7466517686843872, "rewards/simpleverify_reward/std": 0.435171514749527, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3678.0, "completions/mean_length": 712.9319458007812, "completions/mean_terminated_length": 693.947265625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 10.401166180758018, "grad_norm": 0.134503573179245, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 725648701.0, "reward": 0.6674107313156128, "reward_std": 0.11201343685388565, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 611.8125, "completions/mean_terminated_length": 600.1075439453125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 10.410495626822158, "grad_norm": 0.16522261500358582, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 726291253.0, "reward": 0.7299107313156128, "reward_std": 0.11242502927780151, "rewards/simpleverify_reward/mean": 0.7299107313156128, "rewards/simpleverify_reward/std": 0.44425368309020996, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2823.0, "completions/mean_length": 661.310302734375, "completions/mean_terminated_length": 642.0359497070312, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 10.419825072886297, "grad_norm": 0.17985355854034424, "learning_rate": 1e-06, "loss": 0.0274, "num_tokens": 726963083.0, "reward": 0.7366071939468384, "reward_std": 0.1573241800069809, "rewards/simpleverify_reward/mean": 0.7366071343421936, "rewards/simpleverify_reward/std": 0.44071969389915466, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3512.0, "completions/mean_length": 681.3995971679688, "completions/mean_terminated_length": 666.0874633789062, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 10.429154518950437, "grad_norm": 0.15784963965415955, "learning_rate": 1e-06, "loss": 0.0176, "num_tokens": 727665873.0, "reward": 0.6361607313156128, "reward_std": 0.15770143270492554, "rewards/simpleverify_reward/mean": 0.6361607313156128, "rewards/simpleverify_reward/std": 0.4813718795776367, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3062.0, "completions/mean_length": 630.3292846679688, "completions/mean_terminated_length": 622.5760498046875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 10.438483965014576, "grad_norm": 0.17319656908512115, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 728310536.0, "reward": 0.7243303656578064, "reward_std": 0.1703290492296219, "rewards/simpleverify_reward/mean": 0.7243303656578064, "rewards/simpleverify_reward/std": 0.4471006691455841, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2569.0, "completions/mean_length": 658.140625, "completions/mean_terminated_length": 642.7242431640625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 10.447813411078718, "grad_norm": 0.14894385635852814, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 728991502.0, "reward": 0.6941964626312256, "reward_std": 0.13019424676895142, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.4610042870044708, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2843.0, "completions/mean_length": 689.646240234375, "completions/mean_terminated_length": 670.5308837890625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 10.457142857142857, "grad_norm": 0.14386087656021118, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 729702577.0, "reward": 0.6227678656578064, "reward_std": 0.11370521783828735, "rewards/simpleverify_reward/mean": 0.6227678656578064, "rewards/simpleverify_reward/std": 0.4849644899368286, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 631.1082763671875, "completions/mean_terminated_length": 619.4680786132812, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 10.466472303206997, "grad_norm": 0.148788183927536, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 730359426.0, "reward": 0.7187500596046448, "reward_std": 0.10788031667470932, "rewards/simpleverify_reward/mean": 0.71875, "rewards/simpleverify_reward/std": 0.4498603343963623, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 654.3236694335938, "completions/mean_terminated_length": 638.89013671875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 10.475801749271136, "grad_norm": 0.16358958184719086, "learning_rate": 1e-06, "loss": 0.0232, "num_tokens": 731036676.0, "reward": 0.6852678656578064, "reward_std": 0.13211242854595184, "rewards/simpleverify_reward/mean": 0.6852678656578064, "rewards/simpleverify_reward/std": 0.46466848254203796, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 595.4085083007812, "completions/mean_terminated_length": 591.4971923828125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 10.485131195335278, "grad_norm": 0.15175428986549377, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 731650794.0, "reward": 0.7500000596046448, "reward_std": 0.11681917309761047, "rewards/simpleverify_reward/mean": 0.75, "rewards/simpleverify_reward/std": 0.43325456976890564, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3633.0, "completions/mean_length": 657.0658569335938, "completions/mean_terminated_length": 645.5128784179688, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 10.494460641399417, "grad_norm": 0.15494970977306366, "learning_rate": 1e-06, "loss": 0.0101, "num_tokens": 732323621.0, "reward": 0.7064732313156128, "reward_std": 0.12587636709213257, "rewards/simpleverify_reward/mean": 0.7064732313156128, "rewards/simpleverify_reward/std": 0.4556320011615753, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3309.0, "completions/mean_length": 648.2042846679688, "completions/mean_terminated_length": 632.7432861328125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 10.503790087463557, "grad_norm": 0.14836889505386353, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 732990316.0, "reward": 0.6495535969734192, "reward_std": 0.11761081963777542, "rewards/simpleverify_reward/mean": 0.6495535969734192, "rewards/simpleverify_reward/std": 0.477376252412796, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2230.0, "completions/mean_length": 609.3739013671875, "completions/mean_terminated_length": 577.9628295898438, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 10.513119533527696, "grad_norm": 0.16624905169010162, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 733616539.0, "reward": 0.7243303656578064, "reward_std": 0.1356835514307022, "rewards/simpleverify_reward/mean": 0.7243303656578064, "rewards/simpleverify_reward/std": 0.4471006691455841, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2481.0, "completions/mean_length": 625.7701416015625, "completions/mean_terminated_length": 621.8927001953125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 10.522448979591836, "grad_norm": 0.16376866400241852, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 734283013.0, "reward": 0.7243303656578064, "reward_std": 0.12189660966396332, "rewards/simpleverify_reward/mean": 0.7243303656578064, "rewards/simpleverify_reward/std": 0.4471006691455841, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2495.0, "completions/mean_length": 573.3928833007812, "completions/mean_terminated_length": 565.5123291015625, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 10.531778425655977, "grad_norm": 0.16763454675674438, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 734888613.0, "reward": 0.723214328289032, "reward_std": 0.13432738184928894, "rewards/simpleverify_reward/mean": 0.7232142686843872, "rewards/simpleverify_reward/std": 0.44765952229499817, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2689.0, "completions/mean_length": 605.8426513671875, "completions/mean_terminated_length": 590.1917114257812, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 10.541107871720117, "grad_norm": 0.1739734411239624, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 735532936.0, "reward": 0.7243303656578064, "reward_std": 0.14650851488113403, "rewards/simpleverify_reward/mean": 0.7243303656578064, "rewards/simpleverify_reward/std": 0.4471006691455841, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2661.0, "completions/mean_length": 607.1864013671875, "completions/mean_terminated_length": 599.3814086914062, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 10.550437317784256, "grad_norm": 0.1648685783147812, "learning_rate": 1e-06, "loss": 0.0244, "num_tokens": 736165759.0, "reward": 0.684151828289032, "reward_std": 0.12035568058490753, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.4651124179363251, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3984.0, "completions/mean_length": 675.8069458007812, "completions/mean_terminated_length": 652.7494506835938, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 10.559766763848396, "grad_norm": 0.12985879182815552, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 736858194.0, "reward": 0.7410714626312256, "reward_std": 0.10554774105548859, "rewards/simpleverify_reward/mean": 0.7410714030265808, "rewards/simpleverify_reward/std": 0.4382909834384918, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 606.1819458007812, "completions/mean_terminated_length": 590.5325317382812, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 10.569096209912537, "grad_norm": 0.15440747141838074, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 737481797.0, "reward": 0.7008928656578064, "reward_std": 0.10867056250572205, "rewards/simpleverify_reward/mean": 0.7008928656578064, "rewards/simpleverify_reward/std": 0.4581226110458374, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3918.0, "completions/mean_length": 711.9453735351562, "completions/mean_terminated_length": 692.9551391601562, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 10.578425655976677, "grad_norm": 0.1431759148836136, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 738213548.0, "reward": 0.6875000596046448, "reward_std": 0.12846927344799042, "rewards/simpleverify_reward/mean": 0.6875, "rewards/simpleverify_reward/std": 0.4637712836265564, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 643.3795166015625, "completions/mean_terminated_length": 612.2747802734375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 10.587755102040816, "grad_norm": 0.15723593533039093, "learning_rate": 1e-06, "loss": 0.0398, "num_tokens": 738888936.0, "reward": 0.7220982313156128, "reward_std": 0.12392357736825943, "rewards/simpleverify_reward/mean": 0.7220982313156128, "rewards/simpleverify_reward/std": 0.44821491837501526, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2753.0, "completions/mean_length": 674.9230346679688, "completions/mean_terminated_length": 655.7250366210938, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 10.597084548104956, "grad_norm": 0.16534721851348877, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 739583499.0, "reward": 0.6752232313156128, "reward_std": 0.16991858184337616, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3640.0, "completions/mean_length": 616.1328125, "completions/mean_terminated_length": 596.60498046875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 10.606413994169095, "grad_norm": 0.1898432821035385, "learning_rate": 1e-06, "loss": 0.0326, "num_tokens": 740220762.0, "reward": 0.7410714626312256, "reward_std": 0.1342952847480774, "rewards/simpleverify_reward/mean": 0.7410714030265808, "rewards/simpleverify_reward/std": 0.43829095363616943, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3757.0, "completions/mean_length": 637.7142944335938, "completions/mean_terminated_length": 622.206298828125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 10.615743440233237, "grad_norm": 0.14832717180252075, "learning_rate": 1e-06, "loss": 0.0062, "num_tokens": 740879882.0, "reward": 0.7220982313156128, "reward_std": 0.11283759027719498, "rewards/simpleverify_reward/mean": 0.7220982313156128, "rewards/simpleverify_reward/std": 0.44821491837501526, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2334.0, "completions/mean_length": 628.138427734375, "completions/mean_terminated_length": 624.263671875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 10.625072886297376, "grad_norm": 0.14940443634986877, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 741539494.0, "reward": 0.6875000596046448, "reward_std": 0.11216428875923157, "rewards/simpleverify_reward/mean": 0.6875, "rewards/simpleverify_reward/std": 0.4637712836265564, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3276.0, "completions/mean_length": 646.4386596679688, "completions/mean_terminated_length": 607.5045166015625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 10.634402332361516, "grad_norm": 0.17394611239433289, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 742210639.0, "reward": 0.7265625596046448, "reward_std": 0.09705325961112976, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2892.0, "completions/mean_length": 611.568115234375, "completions/mean_terminated_length": 603.77294921875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 10.643731778425655, "grad_norm": 0.15844014286994934, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 742835740.0, "reward": 0.707589328289032, "reward_std": 0.11945344507694244, "rewards/simpleverify_reward/mean": 0.7075892686843872, "rewards/simpleverify_reward/std": 0.45512402057647705, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2369.0, "completions/mean_length": 697.9241333007812, "completions/mean_terminated_length": 671.1676025390625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 10.653061224489797, "grad_norm": 0.1589617133140564, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 743559656.0, "reward": 0.6674107313156128, "reward_std": 0.15567445755004883, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3817.0, "completions/mean_length": 683.654052734375, "completions/mean_terminated_length": 660.6494750976562, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 10.662390670553936, "grad_norm": 0.1662493646144867, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 744256146.0, "reward": 0.6886160969734192, "reward_std": 0.13970790803432465, "rewards/simpleverify_reward/mean": 0.6886160969734192, "rewards/simpleverify_reward/std": 0.46331799030303955, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3101.0, "completions/mean_length": 667.1763916015625, "completions/mean_terminated_length": 644.0606689453125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 10.671720116618076, "grad_norm": 0.15078675746917725, "learning_rate": 1e-06, "loss": 0.0165, "num_tokens": 744948528.0, "reward": 0.7299107313156128, "reward_std": 0.1264076977968216, "rewards/simpleverify_reward/mean": 0.7299107313156128, "rewards/simpleverify_reward/std": 0.44425368309020996, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3782.0, "completions/mean_length": 640.630615234375, "completions/mean_terminated_length": 613.4229736328125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 10.681049562682215, "grad_norm": 0.1479480266571045, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 745621405.0, "reward": 0.715401828289032, "reward_std": 0.11907436698675156, "rewards/simpleverify_reward/mean": 0.7154017686843872, "rewards/simpleverify_reward/std": 0.4514748752117157, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 623.2221069335938, "completions/mean_terminated_length": 623.2221069335938, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 10.690379008746355, "grad_norm": 0.1731078326702118, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 746269468.0, "reward": 0.6696428656578064, "reward_std": 0.14008447527885437, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2098.0, "completions/mean_length": 626.3069458007812, "completions/mean_terminated_length": 606.836181640625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 10.699708454810496, "grad_norm": 0.14462612569332123, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 746919695.0, "reward": 0.7120535969734192, "reward_std": 0.11445339024066925, "rewards/simpleverify_reward/mean": 0.7120535969734192, "rewards/simpleverify_reward/std": 0.4530589282512665, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3184.0, "completions/mean_length": 627.1473388671875, "completions/mean_terminated_length": 615.4938354492188, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 10.709037900874636, "grad_norm": 0.1643390953540802, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 747563315.0, "reward": 0.6707589626312256, "reward_std": 0.12125609070062637, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3240.0, "completions/mean_length": 653.0949096679688, "completions/mean_terminated_length": 633.7744140625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 10.718367346938775, "grad_norm": 0.14615756273269653, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 748238168.0, "reward": 0.7265625596046448, "reward_std": 0.12941217422485352, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2664.0, "completions/mean_length": 657.2076416015625, "completions/mean_terminated_length": 653.3653564453125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 10.727696793002915, "grad_norm": 0.181438609957695, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 748912714.0, "reward": 0.6796875596046448, "reward_std": 0.15500116348266602, "rewards/simpleverify_reward/mean": 0.6796875, "rewards/simpleverify_reward/std": 0.4668572247028351, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 624.9475708007812, "completions/mean_terminated_length": 621.0692749023438, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 10.737026239067056, "grad_norm": 0.17405693233013153, "learning_rate": 1e-06, "loss": 0.0165, "num_tokens": 749558195.0, "reward": 0.7254464626312256, "reward_std": 0.142070472240448, "rewards/simpleverify_reward/mean": 0.7254464030265808, "rewards/simpleverify_reward/std": 0.4465382993221283, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3312.0, "completions/mean_length": 690.8158569335938, "completions/mean_terminated_length": 671.7070922851562, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 10.746355685131196, "grad_norm": 0.16064998507499695, "learning_rate": 1e-06, "loss": 0.0317, "num_tokens": 750268446.0, "reward": 0.660714328289032, "reward_std": 0.1734846532344818, "rewards/simpleverify_reward/mean": 0.6607142686843872, "rewards/simpleverify_reward/std": 0.4737313687801361, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3403.0, "completions/mean_length": 657.3125, "completions/mean_terminated_length": 638.0157470703125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 10.755685131195335, "grad_norm": 0.1851307898759842, "learning_rate": 1e-06, "loss": 0.0216, "num_tokens": 750947918.0, "reward": 0.6339285969734192, "reward_std": 0.1472894698381424, "rewards/simpleverify_reward/mean": 0.6339285969734192, "rewards/simpleverify_reward/std": 0.48199838399887085, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3078.0, "completions/mean_length": 661.2421875, "completions/mean_terminated_length": 645.8397216796875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 10.765014577259475, "grad_norm": 0.15234313905239105, "learning_rate": 1e-06, "loss": 0.0156, "num_tokens": 751634311.0, "reward": 0.6662946939468384, "reward_std": 0.13917729258537292, "rewards/simpleverify_reward/mean": 0.6662946343421936, "rewards/simpleverify_reward/std": 0.47179925441741943, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3617.0, "completions/mean_length": 630.044677734375, "completions/mean_terminated_length": 614.5022583007812, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 10.774344023323614, "grad_norm": 0.17668308317661285, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 752290247.0, "reward": 0.7287946939468384, "reward_std": 0.146018847823143, "rewards/simpleverify_reward/mean": 0.7287946343421936, "rewards/simpleverify_reward/std": 0.44483017921447754, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3938.0, "completions/mean_length": 623.2756958007812, "completions/mean_terminated_length": 611.6091918945312, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 10.783673469387756, "grad_norm": 0.16184686124324799, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 752937102.0, "reward": 0.7299107313156128, "reward_std": 0.13362017273902893, "rewards/simpleverify_reward/mean": 0.7299107313156128, "rewards/simpleverify_reward/std": 0.44425368309020996, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3548.0, "completions/mean_length": 627.7277221679688, "completions/mean_terminated_length": 619.9686889648438, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 10.793002915451895, "grad_norm": 0.14595650136470795, "learning_rate": 1e-06, "loss": 0.0382, "num_tokens": 753586850.0, "reward": 0.6707589626312256, "reward_std": 0.11393345147371292, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 683.9319458007812, "completions/mean_terminated_length": 649.3111572265625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 10.802332361516035, "grad_norm": 0.14830806851387024, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 754290517.0, "reward": 0.6752232313156128, "reward_std": 0.1362883448600769, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3808.0, "completions/mean_length": 642.0814819335938, "completions/mean_terminated_length": 614.8853149414062, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 10.811661807580174, "grad_norm": 0.17018185555934906, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 754964342.0, "reward": 0.7243303656578064, "reward_std": 0.15405867993831635, "rewards/simpleverify_reward/mean": 0.7243303656578064, "rewards/simpleverify_reward/std": 0.4471006691455841, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 647.5491333007812, "completions/mean_terminated_length": 628.1975708007812, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 10.820991253644316, "grad_norm": 0.14622312784194946, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 755634330.0, "reward": 0.7098214626312256, "reward_std": 0.126667320728302, "rewards/simpleverify_reward/mean": 0.7098214030265808, "rewards/simpleverify_reward/std": 0.454098105430603, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3922.0, "completions/mean_length": 571.5145263671875, "completions/mean_terminated_length": 547.7539672851562, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 10.830320699708455, "grad_norm": 0.15253780782222748, "learning_rate": 1e-06, "loss": 0.0307, "num_tokens": 756231559.0, "reward": 0.7868303656578064, "reward_std": 0.11791321635246277, "rewards/simpleverify_reward/mean": 0.7868303656578064, "rewards/simpleverify_reward/std": 0.4097752273082733, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3299.0, "completions/mean_length": 668.029052734375, "completions/mean_terminated_length": 652.656982421875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 10.839650145772595, "grad_norm": 0.14921271800994873, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 756923921.0, "reward": 0.6875000596046448, "reward_std": 0.13090920448303223, "rewards/simpleverify_reward/mean": 0.6875, "rewards/simpleverify_reward/std": 0.4637712836265564, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3076.0, "completions/mean_length": 621.5614013671875, "completions/mean_terminated_length": 609.88916015625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 10.848979591836734, "grad_norm": 0.24879232048988342, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 757562888.0, "reward": 0.7109375596046448, "reward_std": 0.15939390659332275, "rewards/simpleverify_reward/mean": 0.7109375, "rewards/simpleverify_reward/std": 0.45358020067214966, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2730.0, "completions/mean_length": 643.1027221679688, "completions/mean_terminated_length": 623.7261962890625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 10.858309037900874, "grad_norm": 0.17947077751159668, "learning_rate": 1e-06, "loss": 0.0288, "num_tokens": 758226732.0, "reward": 0.7165178656578064, "reward_std": 0.152669295668602, "rewards/simpleverify_reward/mean": 0.7165178656578064, "rewards/simpleverify_reward/std": 0.4509401023387909, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2401.0, "completions/max_terminated_length": 2401.0, "completions/mean_length": 585.6652221679688, "completions/mean_terminated_length": 585.6652221679688, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 10.867638483965015, "grad_norm": 0.1871584951877594, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 758841168.0, "reward": 0.7265625596046448, "reward_std": 0.16003581881523132, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3794.0, "completions/mean_length": 707.114990234375, "completions/mean_terminated_length": 645.4988403320312, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 10.876967930029155, "grad_norm": 0.1477368324995041, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 759557079.0, "reward": 0.6595982313156128, "reward_std": 0.12279703468084335, "rewards/simpleverify_reward/mean": 0.6595982313156128, "rewards/simpleverify_reward/std": 0.4741089344024658, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2856.0, "completions/mean_length": 625.9051513671875, "completions/mean_terminated_length": 618.14208984375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 10.886297376093294, "grad_norm": 0.1449766904115677, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 760203106.0, "reward": 0.7366071939468384, "reward_std": 0.10228154808282852, "rewards/simpleverify_reward/mean": 0.7366071343421936, "rewards/simpleverify_reward/std": 0.4407196640968323, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3599.0, "completions/mean_length": 649.1796875, "completions/mean_terminated_length": 633.72314453125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 10.895626822157434, "grad_norm": 0.18191972374916077, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 760873243.0, "reward": 0.7555803656578064, "reward_std": 0.14320023357868195, "rewards/simpleverify_reward/mean": 0.7555803656578064, "rewards/simpleverify_reward/std": 0.42998260259628296, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 624.591552734375, "completions/mean_terminated_length": 605.1111450195312, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 10.904956268221575, "grad_norm": 0.15778948366641998, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 761520725.0, "reward": 0.7433035969734192, "reward_std": 0.12186270952224731, "rewards/simpleverify_reward/mean": 0.7433035969734192, "rewards/simpleverify_reward/std": 0.43705442547798157, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3545.0, "completions/mean_length": 582.3995971679688, "completions/mean_terminated_length": 554.7334594726562, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 10.914285714285715, "grad_norm": 0.1544666737318039, "learning_rate": 1e-06, "loss": 0.0142, "num_tokens": 762134307.0, "reward": 0.7522321939468384, "reward_std": 0.11396484076976776, "rewards/simpleverify_reward/mean": 0.7522321343421936, "rewards/simpleverify_reward/std": 0.4319573938846588, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2509.0, "completions/mean_length": 731.0335083007812, "completions/mean_terminated_length": 693.05419921875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 10.923615160349854, "grad_norm": 0.15631835162639618, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 762875409.0, "reward": 0.621651828289032, "reward_std": 0.16037102043628693, "rewards/simpleverify_reward/mean": 0.6216517686843872, "rewards/simpleverify_reward/std": 0.4852459728717804, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 620.9420166015625, "completions/mean_terminated_length": 589.6351318359375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 10.932944606413994, "grad_norm": 0.17450663447380066, "learning_rate": 1e-06, "loss": 0.0335, "num_tokens": 763523021.0, "reward": 0.707589328289032, "reward_std": 0.1575159728527069, "rewards/simpleverify_reward/mean": 0.7075892686843872, "rewards/simpleverify_reward/std": 0.45512402057647705, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2817.0, "completions/mean_length": 647.703125, "completions/mean_terminated_length": 639.9888305664062, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 10.942274052478133, "grad_norm": 0.16475726664066315, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 764199787.0, "reward": 0.6194196939468384, "reward_std": 0.15244106948375702, "rewards/simpleverify_reward/mean": 0.6194196343421936, "rewards/simpleverify_reward/std": 0.48580074310302734, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3085.0, "completions/mean_length": 702.0714721679688, "completions/mean_terminated_length": 671.4954833984375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 10.951603498542275, "grad_norm": 0.1438385397195816, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 764919227.0, "reward": 0.6506696939468384, "reward_std": 0.1244862824678421, "rewards/simpleverify_reward/mean": 0.6506696343421936, "rewards/simpleverify_reward/std": 0.47702476382255554, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3853.0, "completions/mean_length": 754.7813110351562, "completions/mean_terminated_length": 705.5900268554688, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 10.960932944606414, "grad_norm": 0.15813684463500977, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 765677039.0, "reward": 0.676339328289032, "reward_std": 0.15401588380336761, "rewards/simpleverify_reward/mean": 0.6763392686843872, "rewards/simpleverify_reward/std": 0.4681335985660553, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 646.2433471679688, "completions/mean_terminated_length": 638.5257568359375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 10.970262390670554, "grad_norm": 0.15871421992778778, "learning_rate": 1e-06, "loss": 0.0149, "num_tokens": 766341489.0, "reward": 0.6741071939468384, "reward_std": 0.13362017273902893, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2906.0, "completions/max_terminated_length": 2906.0, "completions/mean_length": 608.953125, "completions/mean_terminated_length": 608.953125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 10.979591836734693, "grad_norm": 0.1490984708070755, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 766980111.0, "reward": 0.7444196939468384, "reward_std": 0.11032096296548843, "rewards/simpleverify_reward/mean": 0.7444196343421936, "rewards/simpleverify_reward/std": 0.43643057346343994, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3183.0, "completions/mean_length": 628.411865234375, "completions/mean_terminated_length": 593.2277221679688, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 10.988921282798835, "grad_norm": 0.16744522750377655, "learning_rate": 1e-06, "loss": 0.0236, "num_tokens": 767633048.0, "reward": 0.7399553656578064, "reward_std": 0.13470645248889923, "rewards/simpleverify_reward/mean": 0.7399553656578064, "rewards/simpleverify_reward/std": 0.43890365958213806, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008522727272727293, "completions/max_length": 4096.0, "completions/max_terminated_length": 3104.0, "completions/mean_length": 715.1818237304688, "completions/mean_terminated_length": 686.120361328125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 10.998250728862974, "grad_norm": 0.13844981789588928, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 768343153.0, "reward": 0.621651828289032, "reward_std": 0.13865412771701813, "rewards/simpleverify_reward/mean": 0.6216517686843872, "rewards/simpleverify_reward/std": 0.4852459728717804, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 696.3359985351562, "completions/mean_terminated_length": 665.7083740234375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 11.00932944606414, "grad_norm": 0.14230172336101532, "learning_rate": 1e-06, "loss": 0.0328, "num_tokens": 769060766.0, "reward": 0.7053571939468384, "reward_std": 0.13673663139343262, "rewards/simpleverify_reward/mean": 0.7053571343421936, "rewards/simpleverify_reward/std": 0.45613667368888855, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3532.0, "completions/mean_length": 767.6942138671875, "completions/mean_terminated_length": 718.6930541992188, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 11.018658892128279, "grad_norm": 0.12969936430454254, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 769840092.0, "reward": 0.6506696939468384, "reward_std": 0.11227376759052277, "rewards/simpleverify_reward/mean": 0.6506696343421936, "rewards/simpleverify_reward/std": 0.47702476382255554, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3960.0, "completions/mean_length": 623.7935791015625, "completions/mean_terminated_length": 604.3086547851562, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 11.02798833819242, "grad_norm": 0.20437020063400269, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 770480379.0, "reward": 0.6908482313156128, "reward_std": 0.17656832933425903, "rewards/simpleverify_reward/mean": 0.6908482313156128, "rewards/simpleverify_reward/std": 0.46240198612213135, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3804.0, "completions/mean_length": 678.1160888671875, "completions/mean_terminated_length": 627.796142578125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 11.03731778425656, "grad_norm": 0.1329992115497589, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 771174123.0, "reward": 0.6830357313156128, "reward_std": 0.13301284611225128, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2646.0, "completions/mean_length": 697.1585083007812, "completions/mean_terminated_length": 658.796875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 11.0466472303207, "grad_norm": 0.1466340571641922, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 771885241.0, "reward": 0.6897321939468384, "reward_std": 0.1277952641248703, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3952.0, "completions/mean_length": 643.5301513671875, "completions/mean_terminated_length": 628.0482177734375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 11.055976676384839, "grad_norm": 0.16810737550258636, "learning_rate": 1e-06, "loss": 0.0405, "num_tokens": 772551228.0, "reward": 0.6986607313156128, "reward_std": 0.12422595918178558, "rewards/simpleverify_reward/mean": 0.6986607313156128, "rewards/simpleverify_reward/std": 0.4590960443019867, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3466.0, "completions/mean_length": 696.5033569335938, "completions/mean_terminated_length": 646.4541015625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 11.06530612244898, "grad_norm": 0.16963209211826324, "learning_rate": 1e-06, "loss": 0.0262, "num_tokens": 773271231.0, "reward": 0.6640625, "reward_std": 0.1565753072500229, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 646.1741333007812, "completions/mean_terminated_length": 630.7040405273438, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 11.07463556851312, "grad_norm": 0.17759227752685547, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 773939755.0, "reward": 0.6729910969734192, "reward_std": 0.1693519502878189, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.46938255429267883, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 681.388427734375, "completions/mean_terminated_length": 642.8487548828125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 11.08396501457726, "grad_norm": 0.14979995787143707, "learning_rate": 1e-06, "loss": 0.022, "num_tokens": 774645207.0, "reward": 0.6484375, "reward_std": 0.13763383030891418, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4080.0, "completions/mean_length": 612.421875, "completions/mean_terminated_length": 584.9921264648438, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 11.093294460641399, "grad_norm": 0.17057107388973236, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 775287009.0, "reward": 0.7165178656578064, "reward_std": 0.10908354818820953, "rewards/simpleverify_reward/mean": 0.7165178656578064, "rewards/simpleverify_reward/std": 0.4509401023387909, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 629.859375, "completions/mean_terminated_length": 622.1051635742188, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 11.102623906705539, "grad_norm": 0.17434731125831604, "learning_rate": 1e-06, "loss": 0.0283, "num_tokens": 775951371.0, "reward": 0.6696428656578064, "reward_std": 0.14507704973220825, "rewards/simpleverify_reward/mean": 0.6696428656578064, "rewards/simpleverify_reward/std": 0.47060438990592957, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3838.0, "completions/mean_length": 599.1629638671875, "completions/mean_terminated_length": 583.4821166992188, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 11.11195335276968, "grad_norm": 0.19523712992668152, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 776563565.0, "reward": 0.7477678656578064, "reward_std": 0.1159176379442215, "rewards/simpleverify_reward/mean": 0.7477678656578064, "rewards/simpleverify_reward/std": 0.434536337852478, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 705.2199096679688, "completions/mean_terminated_length": 651.39794921875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 11.12128279883382, "grad_norm": 0.1290528029203415, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 777290218.0, "reward": 0.6886160969734192, "reward_std": 0.10949723422527313, "rewards/simpleverify_reward/mean": 0.6886160969734192, "rewards/simpleverify_reward/std": 0.46331802010536194, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3417.0, "completions/mean_length": 633.46875, "completions/mean_terminated_length": 610.1258544921875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 11.130612244897959, "grad_norm": 0.1294921636581421, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 777934878.0, "reward": 0.7209821939468384, "reward_std": 0.10123056918382645, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3113.0, "completions/mean_length": 699.0949096679688, "completions/mean_terminated_length": 687.68310546875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 11.139941690962099, "grad_norm": 0.13367220759391785, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 778664299.0, "reward": 0.6104910969734192, "reward_std": 0.0921299010515213, "rewards/simpleverify_reward/mean": 0.6104910969734192, "rewards/simpleverify_reward/std": 0.48791128396987915, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3940.0, "completions/mean_length": 662.2444458007812, "completions/mean_terminated_length": 635.2069702148438, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 11.14927113702624, "grad_norm": 0.17730775475502014, "learning_rate": 1e-06, "loss": 0.042, "num_tokens": 779341694.0, "reward": 0.684151828289032, "reward_std": 0.16146688163280487, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.4651124179363251, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 632.0692138671875, "completions/mean_terminated_length": 604.794189453125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 11.15860058309038, "grad_norm": 0.15681475400924683, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 779992524.0, "reward": 0.7578125596046448, "reward_std": 0.12189549207687378, "rewards/simpleverify_reward/mean": 0.7578125, "rewards/simpleverify_reward/std": 0.428646445274353, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3420.0, "completions/mean_length": 627.3225708007812, "completions/mean_terminated_length": 600.0101318359375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 11.167930029154519, "grad_norm": 0.1523633897304535, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 780656573.0, "reward": 0.7042410969734192, "reward_std": 0.12305524200201035, "rewards/simpleverify_reward/mean": 0.7042410969734192, "rewards/simpleverify_reward/std": 0.45663803815841675, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3639.0, "completions/mean_length": 656.669677734375, "completions/mean_terminated_length": 645.1153564453125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 11.177259475218658, "grad_norm": 0.15176145732402802, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 781323693.0, "reward": 0.7131696939468384, "reward_std": 0.12505152821540833, "rewards/simpleverify_reward/mean": 0.7131696343421936, "rewards/simpleverify_reward/std": 0.4525342881679535, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 584.372802734375, "completions/mean_terminated_length": 576.5167846679688, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 11.186588921282798, "grad_norm": 0.15377968549728394, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 781933947.0, "reward": 0.7511160969734192, "reward_std": 0.10719674825668335, "rewards/simpleverify_reward/mean": 0.7511160969734192, "rewards/simpleverify_reward/std": 0.43260788917541504, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3311.0, "completions/mean_length": 626.1830444335938, "completions/mean_terminated_length": 618.4205932617188, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 11.19591836734694, "grad_norm": 0.1405356079339981, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 782579423.0, "reward": 0.7120535969734192, "reward_std": 0.09683641791343689, "rewards/simpleverify_reward/mean": 0.7120535969734192, "rewards/simpleverify_reward/std": 0.4530589282512665, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 632.2366333007812, "completions/mean_terminated_length": 604.962890625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 11.205247813411079, "grad_norm": 0.14520321786403656, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 783241619.0, "reward": 0.6930803656578064, "reward_std": 0.11122109740972519, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147337555885315, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 4069.0, "completions/mean_length": 628.3035888671875, "completions/mean_terminated_length": 620.5458374023438, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 11.214577259475218, "grad_norm": 0.17637258768081665, "learning_rate": 1e-06, "loss": 0.0239, "num_tokens": 783899787.0, "reward": 0.7243303656578064, "reward_std": 0.17100416123867035, "rewards/simpleverify_reward/mean": 0.7243303656578064, "rewards/simpleverify_reward/std": 0.4471006691455841, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3830.0, "completions/mean_length": 680.2701416015625, "completions/mean_terminated_length": 653.3746337890625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 11.223906705539358, "grad_norm": 0.16120226681232452, "learning_rate": 1e-06, "loss": 0.0216, "num_tokens": 784592005.0, "reward": 0.6752232313156128, "reward_std": 0.13279713690280914, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 630.7176513671875, "completions/mean_terminated_length": 626.8457641601562, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 11.2332361516035, "grad_norm": 0.12519434094429016, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 785244216.0, "reward": 0.7321428656578064, "reward_std": 0.0986715778708458, "rewards/simpleverify_reward/mean": 0.7321428656578064, "rewards/simpleverify_reward/std": 0.4430900514125824, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3994.0, "completions/mean_length": 631.6652221679688, "completions/mean_terminated_length": 608.3101196289062, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 11.242565597667639, "grad_norm": 0.14953017234802246, "learning_rate": 1e-06, "loss": 0.0265, "num_tokens": 785896620.0, "reward": 0.7477678656578064, "reward_std": 0.10708867013454437, "rewards/simpleverify_reward/mean": 0.7477678656578064, "rewards/simpleverify_reward/std": 0.4345363676548004, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 640.71875, "completions/mean_terminated_length": 613.5118408203125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 11.251895043731778, "grad_norm": 0.15109001100063324, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 786563856.0, "reward": 0.7611607313156128, "reward_std": 0.1293361932039261, "rewards/simpleverify_reward/mean": 0.7611607313156128, "rewards/simpleverify_reward/std": 0.4266124963760376, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3376.0, "completions/mean_length": 673.3348388671875, "completions/mean_terminated_length": 665.6778564453125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 11.261224489795918, "grad_norm": 0.1688837856054306, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 787276724.0, "reward": 0.6261160969734192, "reward_std": 0.15608815848827362, "rewards/simpleverify_reward/mean": 0.6261160969734192, "rewards/simpleverify_reward/std": 0.48410359025001526, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 534.5748291015625, "completions/mean_terminated_length": 522.6102905273438, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 11.270553935860057, "grad_norm": 0.16717830300331116, "learning_rate": 1e-06, "loss": 0.0205, "num_tokens": 787846639.0, "reward": 0.7087053656578064, "reward_std": 0.13798151910305023, "rewards/simpleverify_reward/mean": 0.7087053656578064, "rewards/simpleverify_reward/std": 0.45461276173591614, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2526.0, "completions/mean_length": 619.3392944335938, "completions/mean_terminated_length": 595.901123046875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 11.279883381924199, "grad_norm": 0.16265946626663208, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 788487695.0, "reward": 0.7165178656578064, "reward_std": 0.15015265345573425, "rewards/simpleverify_reward/mean": 0.7165178656578064, "rewards/simpleverify_reward/std": 0.4509401023387909, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3714.0, "completions/mean_length": 597.677490234375, "completions/mean_terminated_length": 593.7686767578125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 11.289212827988338, "grad_norm": 0.14138659834861755, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 789116182.0, "reward": 0.684151828289032, "reward_std": 0.11430183798074722, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.4651124179363251, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 579.8850708007812, "completions/mean_terminated_length": 572.01904296875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 11.298542274052478, "grad_norm": 0.17249111831188202, "learning_rate": 1e-06, "loss": 0.0282, "num_tokens": 789723447.0, "reward": 0.6986607313156128, "reward_std": 0.13373145461082458, "rewards/simpleverify_reward/mean": 0.6986607313156128, "rewards/simpleverify_reward/std": 0.4590960443019867, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3233.0, "completions/mean_length": 557.1663208007812, "completions/mean_terminated_length": 541.297119140625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 11.307871720116617, "grad_norm": 0.1589939445257187, "learning_rate": 1e-06, "loss": 0.0314, "num_tokens": 790301924.0, "reward": 0.785714328289032, "reward_std": 0.10660012811422348, "rewards/simpleverify_reward/mean": 0.7857142686843872, "rewards/simpleverify_reward/std": 0.41055506467819214, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3409.0, "completions/mean_length": 716.0547485351562, "completions/mean_terminated_length": 689.4409790039062, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 11.317201166180759, "grad_norm": 0.16370826959609985, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 791043725.0, "reward": 0.6484375, "reward_std": 0.14692038297653198, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2704.0, "completions/mean_length": 630.286865234375, "completions/mean_terminated_length": 599.064208984375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 11.326530612244898, "grad_norm": 0.17143958806991577, "learning_rate": 1e-06, "loss": 0.0252, "num_tokens": 791694966.0, "reward": 0.6975446939468384, "reward_std": 0.12287119030952454, "rewards/simpleverify_reward/mean": 0.6975446343421936, "rewards/simpleverify_reward/std": 0.45957788825035095, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2473.0, "completions/max_terminated_length": 2473.0, "completions/mean_length": 573.6551513671875, "completions/mean_terminated_length": 573.6551513671875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 11.335860058309038, "grad_norm": 0.15876738727092743, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 792286289.0, "reward": 0.7511160969734192, "reward_std": 0.10487444698810577, "rewards/simpleverify_reward/mean": 0.7511160969734192, "rewards/simpleverify_reward/std": 0.43260788917541504, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2440.0, "completions/mean_length": 602.9420166015625, "completions/mean_terminated_length": 591.2072143554688, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 11.345189504373177, "grad_norm": 0.17977295815944672, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 792914741.0, "reward": 0.691964328289032, "reward_std": 0.12189590930938721, "rewards/simpleverify_reward/mean": 0.6919642686843872, "rewards/simpleverify_reward/std": 0.4619392454624176, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 665.1920166015625, "completions/mean_terminated_length": 649.8071899414062, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 11.354518950437317, "grad_norm": 0.1546149104833603, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 793605209.0, "reward": 0.7243303656578064, "reward_std": 0.12621267139911652, "rewards/simpleverify_reward/mean": 0.7243303656578064, "rewards/simpleverify_reward/std": 0.4471006691455841, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3570.0, "completions/mean_length": 686.560302734375, "completions/mean_terminated_length": 655.8446044921875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 11.363848396501458, "grad_norm": 0.14414334297180176, "learning_rate": 1e-06, "loss": 0.0286, "num_tokens": 794302191.0, "reward": 0.668526828289032, "reward_std": 0.13290590047836304, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2272.0, "completions/mean_length": 604.8939819335938, "completions/mean_terminated_length": 581.3584594726562, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 11.373177842565598, "grad_norm": 0.1640087217092514, "learning_rate": 1e-06, "loss": 0.0391, "num_tokens": 794944424.0, "reward": 0.6897321939468384, "reward_std": 0.13996823132038116, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 658.9475708007812, "completions/mean_terminated_length": 639.6599731445312, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 11.382507288629737, "grad_norm": 0.14472778141498566, "learning_rate": 1e-06, "loss": 0.0285, "num_tokens": 795629857.0, "reward": 0.7321428656578064, "reward_std": 0.13087712228298187, "rewards/simpleverify_reward/mean": 0.7321428656578064, "rewards/simpleverify_reward/std": 0.4430900514125824, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3286.0, "completions/mean_length": 602.0982666015625, "completions/mean_terminated_length": 578.5438232421875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 11.391836734693877, "grad_norm": 0.16720043122768402, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 796259361.0, "reward": 0.7042410969734192, "reward_std": 0.13004271686077118, "rewards/simpleverify_reward/mean": 0.7042410969734192, "rewards/simpleverify_reward/std": 0.45663803815841675, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2665.0, "completions/mean_length": 604.9765625, "completions/mean_terminated_length": 593.2485961914062, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 11.401166180758018, "grad_norm": 0.17604251205921173, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 796889324.0, "reward": 0.7064732313156128, "reward_std": 0.14567367732524872, "rewards/simpleverify_reward/mean": 0.7064732313156128, "rewards/simpleverify_reward/std": 0.4556320011615753, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 678.841552734375, "completions/mean_terminated_length": 632.4547729492188, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 11.410495626822158, "grad_norm": 0.15326300263404846, "learning_rate": 1e-06, "loss": 0.0313, "num_tokens": 797579966.0, "reward": 0.6618303656578064, "reward_std": 0.1379726380109787, "rewards/simpleverify_reward/mean": 0.6618303656578064, "rewards/simpleverify_reward/std": 0.4733508229255676, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3952.0, "completions/mean_length": 645.3381958007812, "completions/mean_terminated_length": 633.745849609375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 11.419825072886297, "grad_norm": 0.15465842187404633, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 798245581.0, "reward": 0.746651828289032, "reward_std": 0.12200538069009781, "rewards/simpleverify_reward/mean": 0.7466517686843872, "rewards/simpleverify_reward/std": 0.435171514749527, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3690.0, "completions/mean_length": 696.4498291015625, "completions/mean_terminated_length": 685.0291137695312, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 11.429154518950437, "grad_norm": 0.16637495160102844, "learning_rate": 1e-06, "loss": 0.0215, "num_tokens": 798949704.0, "reward": 0.6640625, "reward_std": 0.14064012467861176, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2479.0, "completions/mean_length": 575.7176513671875, "completions/mean_terminated_length": 551.9854125976562, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 11.438483965014576, "grad_norm": 0.18470041453838348, "learning_rate": 1e-06, "loss": 0.0295, "num_tokens": 799549691.0, "reward": 0.7511160969734192, "reward_std": 0.12918464839458466, "rewards/simpleverify_reward/mean": 0.7511160969734192, "rewards/simpleverify_reward/std": 0.43260788917541504, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3997.0, "completions/mean_length": 664.2020263671875, "completions/mean_terminated_length": 641.0662841796875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 11.447813411078718, "grad_norm": 0.14015157520771027, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 800236776.0, "reward": 0.660714328289032, "reward_std": 0.11524616181850433, "rewards/simpleverify_reward/mean": 0.6607142686843872, "rewards/simpleverify_reward/std": 0.4737313687801361, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2827.0, "completions/mean_length": 604.7511596679688, "completions/mean_terminated_length": 596.9407348632812, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 11.457142857142857, "grad_norm": 0.15339545905590057, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 800855049.0, "reward": 0.7321428656578064, "reward_std": 0.13234136998653412, "rewards/simpleverify_reward/mean": 0.7321428656578064, "rewards/simpleverify_reward/std": 0.4430900514125824, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 606.052490234375, "completions/mean_terminated_length": 586.468017578125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 11.466472303206997, "grad_norm": 0.14035794138908386, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 801480080.0, "reward": 0.7220982313156128, "reward_std": 0.09761849790811539, "rewards/simpleverify_reward/mean": 0.7220982313156128, "rewards/simpleverify_reward/std": 0.44821488857269287, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3921.0, "completions/mean_length": 645.7277221679688, "completions/mean_terminated_length": 634.1366577148438, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 11.475801749271136, "grad_norm": 0.15566565096378326, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 802143252.0, "reward": 0.6785714626312256, "reward_std": 0.1465080827474594, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2832.0, "completions/mean_length": 590.716552734375, "completions/mean_terminated_length": 559.1373901367188, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 11.485131195335278, "grad_norm": 0.16868524253368378, "learning_rate": 1e-06, "loss": 0.0358, "num_tokens": 802744606.0, "reward": 0.7265625596046448, "reward_std": 0.12610390782356262, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3400.0, "completions/mean_length": 670.7433471679688, "completions/mean_terminated_length": 624.2466430664062, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 11.494460641399417, "grad_norm": 0.18784180283546448, "learning_rate": 1e-06, "loss": 0.033, "num_tokens": 803437448.0, "reward": 0.7276785969734192, "reward_std": 0.14977401494979858, "rewards/simpleverify_reward/mean": 0.7276785969734192, "rewards/simpleverify_reward/std": 0.4454030692577362, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 653.3013916015625, "completions/mean_terminated_length": 630.0921630859375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 11.503790087463557, "grad_norm": 0.14360438287258148, "learning_rate": 1e-06, "loss": 0.0252, "num_tokens": 804116070.0, "reward": 0.6986607313156128, "reward_std": 0.1277543008327484, "rewards/simpleverify_reward/mean": 0.6986607313156128, "rewards/simpleverify_reward/std": 0.4590960443019867, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2868.0, "completions/mean_length": 603.5971069335938, "completions/mean_terminated_length": 595.7841186523438, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 11.513119533527696, "grad_norm": 0.17632661759853363, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 804748189.0, "reward": 0.6573660969734192, "reward_std": 0.14515303075313568, "rewards/simpleverify_reward/mean": 0.6573660969734192, "rewards/simpleverify_reward/std": 0.47485533356666565, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2666.0, "completions/mean_length": 611.8705444335938, "completions/mean_terminated_length": 596.2466430664062, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 11.522448979591836, "grad_norm": 0.1577792763710022, "learning_rate": 1e-06, "loss": 0.0169, "num_tokens": 805378865.0, "reward": 0.7265625596046448, "reward_std": 0.11922521889209747, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3163.0, "completions/mean_length": 672.9933471679688, "completions/mean_terminated_length": 646.04052734375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 11.531778425655977, "grad_norm": 0.15231746435165405, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 806073347.0, "reward": 0.6941964626312256, "reward_std": 0.14338386058807373, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.4610042870044708, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 698.5748291015625, "completions/mean_terminated_length": 660.2291259765625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 11.541107871720117, "grad_norm": 0.1592981368303299, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 806787398.0, "reward": 0.6629464626312256, "reward_std": 0.14409995079040527, "rewards/simpleverify_reward/mean": 0.6629464030265808, "rewards/simpleverify_reward/std": 0.47296738624572754, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3606.0, "completions/mean_length": 673.7835083007812, "completions/mean_terminated_length": 635.1580200195312, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 11.550437317784256, "grad_norm": 0.13674083352088928, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 807475340.0, "reward": 0.7165178656578064, "reward_std": 0.10085079073905945, "rewards/simpleverify_reward/mean": 0.7165178656578064, "rewards/simpleverify_reward/std": 0.4509401023387909, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3322.0, "completions/mean_length": 633.3236694335938, "completions/mean_terminated_length": 617.7960205078125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 11.559766763848396, "grad_norm": 0.17991995811462402, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 808147846.0, "reward": 0.699776828289032, "reward_std": 0.14830933511257172, "rewards/simpleverify_reward/mean": 0.6997767686843872, "rewards/simpleverify_reward/std": 0.4586109220981598, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 657.224365234375, "completions/mean_terminated_length": 634.0415649414062, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 11.569096209912537, "grad_norm": 0.16613511741161346, "learning_rate": 1e-06, "loss": 0.0264, "num_tokens": 808824919.0, "reward": 0.7087053656578064, "reward_std": 0.1497747153043747, "rewards/simpleverify_reward/mean": 0.7087053656578064, "rewards/simpleverify_reward/std": 0.45461276173591614, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3119.0, "completions/mean_length": 597.5982666015625, "completions/mean_terminated_length": 585.845458984375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 11.578425655976677, "grad_norm": 0.14625701308250427, "learning_rate": 1e-06, "loss": 0.0246, "num_tokens": 809447815.0, "reward": 0.7656250596046448, "reward_std": 0.10190316289663315, "rewards/simpleverify_reward/mean": 0.765625, "rewards/simpleverify_reward/std": 0.4238441288471222, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3281.0, "completions/mean_length": 643.7310791015625, "completions/mean_terminated_length": 620.4573364257812, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 11.587755102040816, "grad_norm": 0.11649726331233978, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 810118950.0, "reward": 0.6908482313156128, "reward_std": 0.0891667902469635, "rewards/simpleverify_reward/mean": 0.6908482313156128, "rewards/simpleverify_reward/std": 0.46240198612213135, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2269.0, "completions/mean_length": 649.849365234375, "completions/mean_terminated_length": 618.8029174804688, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 11.597084548104956, "grad_norm": 0.15361373126506805, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 810806383.0, "reward": 0.6484375, "reward_std": 0.13835102319717407, "rewards/simpleverify_reward/mean": 0.6484375, "rewards/simpleverify_reward/std": 0.4777248501777649, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3773.0, "completions/mean_length": 673.3683471679688, "completions/mean_terminated_length": 650.2943725585938, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 11.606413994169095, "grad_norm": 0.19781997799873352, "learning_rate": 1e-06, "loss": 0.0358, "num_tokens": 811512201.0, "reward": 0.625, "reward_std": 0.17551638185977936, "rewards/simpleverify_reward/mean": 0.625, "rewards/simpleverify_reward/std": 0.48439329862594604, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2444.0, "completions/mean_length": 640.3616333007812, "completions/mean_terminated_length": 628.7525634765625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 11.615743440233237, "grad_norm": 0.14386917650699615, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 812174525.0, "reward": 0.7098214626312256, "reward_std": 0.12692946195602417, "rewards/simpleverify_reward/mean": 0.7098214030265808, "rewards/simpleverify_reward/std": 0.454098105430603, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 4012.0, "completions/mean_length": 608.3928833007812, "completions/mean_terminated_length": 600.5906372070312, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 11.625072886297376, "grad_norm": 0.1444224715232849, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 812807853.0, "reward": 0.7756696939468384, "reward_std": 0.09964797645807266, "rewards/simpleverify_reward/mean": 0.7756696343421936, "rewards/simpleverify_reward/std": 0.41737356781959534, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 633.911865234375, "completions/mean_terminated_length": 626.1666870117188, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 11.634402332361516, "grad_norm": 0.1534484326839447, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 813463510.0, "reward": 0.6930803656578064, "reward_std": 0.14015047252178192, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147334575653076, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2842.0, "completions/mean_length": 651.1998291015625, "completions/mean_terminated_length": 620.16552734375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 11.643731778425655, "grad_norm": 0.15286573767662048, "learning_rate": 1e-06, "loss": 0.0329, "num_tokens": 814135329.0, "reward": 0.6707589626312256, "reward_std": 0.13000810146331787, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 610.7377319335938, "completions/mean_terminated_length": 591.1796264648438, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 11.653061224489797, "grad_norm": 0.16647876799106598, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 814771734.0, "reward": 0.7399553656578064, "reward_std": 0.13200436532497406, "rewards/simpleverify_reward/mean": 0.7399553656578064, "rewards/simpleverify_reward/std": 0.43890365958213806, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2688.0, "completions/mean_length": 619.8092041015625, "completions/mean_terminated_length": 592.4376220703125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 11.662390670553936, "grad_norm": 0.16448190808296204, "learning_rate": 1e-06, "loss": 0.0418, "num_tokens": 815407195.0, "reward": 0.715401828289032, "reward_std": 0.12170863896608353, "rewards/simpleverify_reward/mean": 0.7154017686843872, "rewards/simpleverify_reward/std": 0.4514748752117157, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3586.0, "completions/mean_length": 658.5067138671875, "completions/mean_terminated_length": 631.4398193359375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 11.671720116618076, "grad_norm": 0.16846050322055817, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 816081801.0, "reward": 0.746651828289032, "reward_std": 0.14440485835075378, "rewards/simpleverify_reward/mean": 0.7466517686843872, "rewards/simpleverify_reward/std": 0.435171514749527, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 663.3035888671875, "completions/mean_terminated_length": 624.559814453125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 11.681049562682215, "grad_norm": 0.15185964107513428, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 816767793.0, "reward": 0.668526828289032, "reward_std": 0.14056484401226044, "rewards/simpleverify_reward/mean": 0.6685267686843872, "rewards/simpleverify_reward/std": 0.4710056483745575, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2313.0, "completions/mean_length": 681.359375, "completions/mean_terminated_length": 662.1975708007812, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 11.690379008746355, "grad_norm": 0.15698765218257904, "learning_rate": 1e-06, "loss": 0.0204, "num_tokens": 817470283.0, "reward": 0.6506696939468384, "reward_std": 0.1466914266347885, "rewards/simpleverify_reward/mean": 0.6506696343421936, "rewards/simpleverify_reward/std": 0.47702476382255554, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 620.5692138671875, "completions/mean_terminated_length": 616.68603515625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 11.699708454810496, "grad_norm": 0.13617822527885437, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 818122753.0, "reward": 0.7511160969734192, "reward_std": 0.10926718264818192, "rewards/simpleverify_reward/mean": 0.7511160969734192, "rewards/simpleverify_reward/std": 0.43260788917541504, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3862.0, "completions/mean_length": 686.591552734375, "completions/mean_terminated_length": 675.1377563476562, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 11.709037900874636, "grad_norm": 0.16558538377285004, "learning_rate": 1e-06, "loss": 0.0333, "num_tokens": 818824867.0, "reward": 0.7198660969734192, "reward_std": 0.13267837464809418, "rewards/simpleverify_reward/mean": 0.7198660969734192, "rewards/simpleverify_reward/std": 0.44931530952453613, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 612.8515625, "completions/mean_terminated_length": 605.0592651367188, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 11.718367346938775, "grad_norm": 0.15257282555103302, "learning_rate": 1e-06, "loss": 0.0179, "num_tokens": 819463574.0, "reward": 0.6953125596046448, "reward_std": 0.12497595697641373, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4021.0, "completions/mean_length": 689.4642944335938, "completions/mean_terminated_length": 666.4989013671875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 11.727696793002915, "grad_norm": 0.16344113647937775, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 820176446.0, "reward": 0.6941964626312256, "reward_std": 0.1356828510761261, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.4610042870044708, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3933.0, "completions/mean_length": 648.6350708007812, "completions/mean_terminated_length": 617.5776977539062, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 11.737026239067056, "grad_norm": 0.14683431386947632, "learning_rate": 1e-06, "loss": 0.0215, "num_tokens": 820849655.0, "reward": 0.7366071939468384, "reward_std": 0.10025347024202347, "rewards/simpleverify_reward/mean": 0.7366071343421936, "rewards/simpleverify_reward/std": 0.4407196640968323, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3531.0, "completions/mean_length": 654.6796875, "completions/mean_terminated_length": 627.5827026367188, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 11.746355685131196, "grad_norm": 0.14175643026828766, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 821527096.0, "reward": 0.6852678656578064, "reward_std": 0.11129847913980484, "rewards/simpleverify_reward/mean": 0.6852678656578064, "rewards/simpleverify_reward/std": 0.46466848254203796, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3588.0, "completions/mean_length": 698.2142944335938, "completions/mean_terminated_length": 648.1902465820312, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 11.755685131195335, "grad_norm": 0.15041162073612213, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 822247632.0, "reward": 0.6573660969734192, "reward_std": 0.1535690277814865, "rewards/simpleverify_reward/mean": 0.6573660969734192, "rewards/simpleverify_reward/std": 0.47485533356666565, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3696.0, "completions/mean_length": 648.8080444335938, "completions/mean_terminated_length": 637.2273559570312, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 11.765014577259475, "grad_norm": 0.1679367870092392, "learning_rate": 1e-06, "loss": 0.0241, "num_tokens": 822920396.0, "reward": 0.6875000596046448, "reward_std": 0.13842588663101196, "rewards/simpleverify_reward/mean": 0.6875, "rewards/simpleverify_reward/std": 0.4637712836265564, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3977.0, "completions/mean_length": 596.3292846679688, "completions/mean_terminated_length": 584.572265625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 11.774344023323614, "grad_norm": 0.14628885686397552, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 823538947.0, "reward": 0.7366071939468384, "reward_std": 0.10641607642173767, "rewards/simpleverify_reward/mean": 0.7366071343421936, "rewards/simpleverify_reward/std": 0.4407196640968323, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 554.921875, "completions/mean_terminated_length": 543.0257568359375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 11.783673469387756, "grad_norm": 0.19412006437778473, "learning_rate": 1e-06, "loss": 0.027, "num_tokens": 824119213.0, "reward": 0.7522321939468384, "reward_std": 0.12869791686534882, "rewards/simpleverify_reward/mean": 0.7522321343421936, "rewards/simpleverify_reward/std": 0.4319573938846588, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2677.0, "completions/mean_length": 585.4140625, "completions/mean_terminated_length": 577.5604248046875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 11.793002915451895, "grad_norm": 0.14683584868907928, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 824730048.0, "reward": 0.7600446939468384, "reward_std": 0.10269410908222198, "rewards/simpleverify_reward/mean": 0.7600446343421936, "rewards/simpleverify_reward/std": 0.42729446291923523, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3196.0, "completions/mean_length": 703.2299194335938, "completions/mean_terminated_length": 684.1908569335938, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 11.802332361516035, "grad_norm": 0.15750521421432495, "learning_rate": 1e-06, "loss": 0.0318, "num_tokens": 825445862.0, "reward": 0.6741071939468384, "reward_std": 0.14320093393325806, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 648.513427734375, "completions/mean_terminated_length": 621.3678588867188, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 11.811661807580174, "grad_norm": 0.163203164935112, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 826114010.0, "reward": 0.7566964626312256, "reward_std": 0.13485799729824066, "rewards/simpleverify_reward/mean": 0.7566964030265808, "rewards/simpleverify_reward/std": 0.4293164908885956, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 683.0045166015625, "completions/mean_terminated_length": 667.6995849609375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 11.820991253644316, "grad_norm": 0.14775890111923218, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 826814646.0, "reward": 0.6986607313156128, "reward_std": 0.11907506734132767, "rewards/simpleverify_reward/mean": 0.6986607313156128, "rewards/simpleverify_reward/std": 0.4590960443019867, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3154.0, "completions/mean_length": 654.3069458007812, "completions/mean_terminated_length": 650.46142578125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 11.830320699708455, "grad_norm": 0.15994858741760254, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 827483945.0, "reward": 0.6741071939468384, "reward_std": 0.1436537802219391, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2479.0, "completions/mean_length": 655.5335083007812, "completions/mean_terminated_length": 628.4432373046875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 11.839650145772595, "grad_norm": 0.16131965816020966, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 828163695.0, "reward": 0.7142857313156128, "reward_std": 0.09979952126741409, "rewards/simpleverify_reward/mean": 0.7142857313156128, "rewards/simpleverify_reward/std": 0.4520062506198883, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 612.4933471679688, "completions/mean_terminated_length": 596.8722534179688, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 11.848979591836734, "grad_norm": 0.1703166663646698, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 828815537.0, "reward": 0.6819196939468384, "reward_std": 0.1162225604057312, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3737.0, "completions/mean_length": 603.3828125, "completions/mean_terminated_length": 595.5693359375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 11.858309037900874, "grad_norm": 0.15102063119411469, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 829444136.0, "reward": 0.7098214626312256, "reward_std": 0.12459687143564224, "rewards/simpleverify_reward/mean": 0.7098214030265808, "rewards/simpleverify_reward/std": 0.454098105430603, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 612.8460083007812, "completions/mean_terminated_length": 601.1444702148438, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 11.867638483965015, "grad_norm": 0.18393152952194214, "learning_rate": 1e-06, "loss": 0.0282, "num_tokens": 830084782.0, "reward": 0.715401828289032, "reward_std": 0.1639414280653, "rewards/simpleverify_reward/mean": 0.7154017686843872, "rewards/simpleverify_reward/std": 0.4514748752117157, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3468.0, "completions/mean_length": 619.9654541015625, "completions/mean_terminated_length": 616.08154296875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 11.876967930029155, "grad_norm": 0.1595909744501114, "learning_rate": 1e-06, "loss": 0.0049, "num_tokens": 830736671.0, "reward": 0.738839328289032, "reward_std": 0.12366325408220291, "rewards/simpleverify_reward/mean": 0.7388392686843872, "rewards/simpleverify_reward/std": 0.439512699842453, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3859.0, "completions/mean_length": 657.4296875, "completions/mean_terminated_length": 626.4515991210938, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 11.886297376093294, "grad_norm": 0.17160169780254364, "learning_rate": 1e-06, "loss": 0.0162, "num_tokens": 831420824.0, "reward": 0.7220982313156128, "reward_std": 0.13714639842510223, "rewards/simpleverify_reward/mean": 0.7220982313156128, "rewards/simpleverify_reward/std": 0.44821488857269287, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3610.0, "completions/mean_length": 681.2723388671875, "completions/mean_terminated_length": 665.9596557617188, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 11.895626822157434, "grad_norm": 0.1520000845193863, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 832119924.0, "reward": 0.6707589626312256, "reward_std": 0.13718876242637634, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 679.4810791015625, "completions/mean_terminated_length": 664.1603393554688, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 11.904956268221575, "grad_norm": 0.1537611037492752, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 832818067.0, "reward": 0.6875000596046448, "reward_std": 0.12407512217760086, "rewards/simpleverify_reward/mean": 0.6875, "rewards/simpleverify_reward/std": 0.4637712836265564, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 756.4933471679688, "completions/mean_terminated_length": 714.9853515625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 11.914285714285715, "grad_norm": 0.14196813106536865, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 833587061.0, "reward": 0.6930803656578064, "reward_std": 0.13511833548545837, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147334575653076, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2568.0, "completions/mean_length": 669.5546875, "completions/mean_terminated_length": 626.9661254882812, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 11.923615160349854, "grad_norm": 0.1781957596540451, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 834278662.0, "reward": 0.6808035969734192, "reward_std": 0.13876500725746155, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.4664256274700165, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3709.0, "completions/mean_length": 658.3270263671875, "completions/mean_terminated_length": 650.636474609375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 11.932944606413994, "grad_norm": 0.1691417545080185, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 834958995.0, "reward": 0.7120535969734192, "reward_std": 0.1485336571931839, "rewards/simpleverify_reward/mean": 0.7120535969734192, "rewards/simpleverify_reward/std": 0.4530589282512665, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3214.0, "completions/mean_length": 688.1763916015625, "completions/mean_terminated_length": 676.7279052734375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 11.942274052478133, "grad_norm": 0.16415482759475708, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 835659617.0, "reward": 0.6908482313156128, "reward_std": 0.14725668728351593, "rewards/simpleverify_reward/mean": 0.6908482313156128, "rewards/simpleverify_reward/std": 0.46240198612213135, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3383.0, "completions/mean_length": 662.3939819335938, "completions/mean_terminated_length": 658.5574951171875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 11.951603498542275, "grad_norm": 0.16586607694625854, "learning_rate": 1e-06, "loss": 0.0159, "num_tokens": 836334738.0, "reward": 0.7053571939468384, "reward_std": 0.13214564323425293, "rewards/simpleverify_reward/mean": 0.7053571343421936, "rewards/simpleverify_reward/std": 0.45613667368888855, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3716.0, "completions/mean_length": 604.9375, "completions/mean_terminated_length": 597.1275024414062, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 11.960932944606414, "grad_norm": 0.2000907063484192, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 836970546.0, "reward": 0.7343750596046448, "reward_std": 0.1374501883983612, "rewards/simpleverify_reward/mean": 0.734375, "rewards/simpleverify_reward/std": 0.44191211462020874, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 623.1082763671875, "completions/mean_terminated_length": 615.3389282226562, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 11.970262390670554, "grad_norm": 0.1623074859380722, "learning_rate": 1e-06, "loss": 0.0182, "num_tokens": 837621019.0, "reward": 0.738839328289032, "reward_std": 0.13305744528770447, "rewards/simpleverify_reward/mean": 0.7388392686843872, "rewards/simpleverify_reward/std": 0.439512699842453, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 671.1283569335938, "completions/mean_terminated_length": 663.4664306640625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 11.979591836734693, "grad_norm": 0.14106181263923645, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 838315878.0, "reward": 0.723214328289032, "reward_std": 0.1248999685049057, "rewards/simpleverify_reward/mean": 0.7232142686843872, "rewards/simpleverify_reward/std": 0.44765952229499817, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3349.0, "completions/mean_length": 703.232177734375, "completions/mean_terminated_length": 684.1930541992188, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 11.988921282798835, "grad_norm": 0.13544268906116486, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 839039166.0, "reward": 0.7053571939468384, "reward_std": 0.11257727444171906, "rewards/simpleverify_reward/mean": 0.7053571343421936, "rewards/simpleverify_reward/std": 0.45613667368888855, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 681.15625, "completions/mean_terminated_length": 681.15625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 11.998250728862974, "grad_norm": 0.17351719737052917, "learning_rate": 1e-06, "loss": 0.0121, "num_tokens": 839687095.0, "reward": 0.7176339626312256, "reward_std": 0.15514063835144043, "rewards/simpleverify_reward/mean": 0.7176339030265808, "rewards/simpleverify_reward/std": 0.4504019320011139, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3404.0, "completions/mean_length": 603.138427734375, "completions/mean_terminated_length": 595.3244018554688, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 12.00932944606414, "grad_norm": 0.1576746106147766, "learning_rate": 1e-06, "loss": 0.0131, "num_tokens": 840315003.0, "reward": 0.6852678656578064, "reward_std": 0.12474912405014038, "rewards/simpleverify_reward/mean": 0.6852678656578064, "rewards/simpleverify_reward/std": 0.4646684527397156, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2890.0, "completions/mean_length": 630.0814819335938, "completions/mean_terminated_length": 622.3277587890625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 12.018658892128279, "grad_norm": 0.15008467435836792, "learning_rate": 1e-06, "loss": 0.0227, "num_tokens": 840967636.0, "reward": 0.699776828289032, "reward_std": 0.1049843281507492, "rewards/simpleverify_reward/mean": 0.6997767686843872, "rewards/simpleverify_reward/std": 0.4586109220981598, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2355.0, "completions/mean_length": 613.5770263671875, "completions/mean_terminated_length": 605.786376953125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 12.02798833819242, "grad_norm": 0.16217105090618134, "learning_rate": 1e-06, "loss": 0.0207, "num_tokens": 841608769.0, "reward": 0.7500000596046448, "reward_std": 0.12358655780553818, "rewards/simpleverify_reward/mean": 0.75, "rewards/simpleverify_reward/std": 0.43325456976890564, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 641.3761596679688, "completions/mean_terminated_length": 633.6476440429688, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 12.03731778425656, "grad_norm": 0.17897292971611023, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 842271762.0, "reward": 0.7410714626312256, "reward_std": 0.12862122058868408, "rewards/simpleverify_reward/mean": 0.7410714030265808, "rewards/simpleverify_reward/std": 0.43829095363616943, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 666.6663208007812, "completions/mean_terminated_length": 624.0418090820312, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 12.0466472303207, "grad_norm": 0.15835712850093842, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 842947031.0, "reward": 0.7723214626312256, "reward_std": 0.12749217450618744, "rewards/simpleverify_reward/mean": 0.7723214030265808, "rewards/simpleverify_reward/std": 0.41956827044487, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3899.0, "completions/mean_length": 679.9944458007812, "completions/mean_terminated_length": 656.9651489257812, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 12.055976676384839, "grad_norm": 0.17060422897338867, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 843642898.0, "reward": 0.7343750596046448, "reward_std": 0.13301397860050201, "rewards/simpleverify_reward/mean": 0.734375, "rewards/simpleverify_reward/std": 0.44191211462020874, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2425.0, "completions/mean_length": 638.9799194335938, "completions/mean_terminated_length": 623.4776000976562, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 12.06530612244898, "grad_norm": 0.13353446125984192, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 844306032.0, "reward": 0.684151828289032, "reward_std": 0.09468747675418854, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.4651124179363251, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3977.0, "completions/mean_length": 653.0792846679688, "completions/mean_terminated_length": 629.8685302734375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 12.07463556851312, "grad_norm": 0.18242429196834564, "learning_rate": 1e-06, "loss": 0.02, "num_tokens": 844985727.0, "reward": 0.7321428656578064, "reward_std": 0.1332082897424698, "rewards/simpleverify_reward/mean": 0.7321428656578064, "rewards/simpleverify_reward/std": 0.4430900514125824, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 702.9766235351562, "completions/mean_terminated_length": 683.93603515625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 12.08396501457726, "grad_norm": 0.1482776254415512, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 845705850.0, "reward": 0.7332589626312256, "reward_std": 0.11509208381175995, "rewards/simpleverify_reward/mean": 0.7332589030265808, "rewards/simpleverify_reward/std": 0.4425028860569, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3712.0, "completions/mean_length": 676.5234375, "completions/mean_terminated_length": 649.5984497070312, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 12.093294460641399, "grad_norm": 0.17344637215137482, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 846406463.0, "reward": 0.7042410969734192, "reward_std": 0.14718110859394073, "rewards/simpleverify_reward/mean": 0.7042410969734192, "rewards/simpleverify_reward/std": 0.45663803815841675, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3153.0, "completions/mean_length": 700.5223388671875, "completions/mean_terminated_length": 677.6314697265625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 12.102623906705539, "grad_norm": 0.1272251456975937, "learning_rate": 1e-06, "loss": 0.016, "num_tokens": 847126835.0, "reward": 0.6941964626312256, "reward_std": 0.10765279084444046, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.4610042870044708, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2132.0, "completions/mean_length": 605.3035888671875, "completions/mean_terminated_length": 597.494384765625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 12.11195335276968, "grad_norm": 0.16220137476921082, "learning_rate": 1e-06, "loss": 0.024, "num_tokens": 847752267.0, "reward": 0.7254464626312256, "reward_std": 0.11847773939371109, "rewards/simpleverify_reward/mean": 0.7254464030265808, "rewards/simpleverify_reward/std": 0.4465382993221283, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 657.1506958007812, "completions/mean_terminated_length": 657.1506958007812, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 12.12128279883382, "grad_norm": 0.12478906661272049, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 848427178.0, "reward": 0.7645089626312256, "reward_std": 0.10769416391849518, "rewards/simpleverify_reward/mean": 0.7645089030265808, "rewards/simpleverify_reward/std": 0.42454230785369873, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0189732142857143, "completions/max_length": 4096.0, "completions/max_terminated_length": 3351.0, "completions/mean_length": 745.2266235351562, "completions/mean_terminated_length": 680.4220581054688, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 12.130612244897959, "grad_norm": 0.16947218775749207, "learning_rate": 1e-06, "loss": 0.0224, "num_tokens": 849188437.0, "reward": 0.6629464626312256, "reward_std": 0.11483316123485565, "rewards/simpleverify_reward/mean": 0.6629464030265808, "rewards/simpleverify_reward/std": 0.47296738624572754, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3543.0, "completions/mean_length": 627.9074096679688, "completions/mean_terminated_length": 616.2564697265625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 12.139941690962099, "grad_norm": 0.16114696860313416, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 849836362.0, "reward": 0.7399553656578064, "reward_std": 0.11963960528373718, "rewards/simpleverify_reward/mean": 0.7399553656578064, "rewards/simpleverify_reward/std": 0.43890365958213806, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3755.0, "completions/mean_length": 752.8281860351562, "completions/mean_terminated_length": 711.2745971679688, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 12.14927113702624, "grad_norm": 0.16589006781578064, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 850610888.0, "reward": 0.7098214626312256, "reward_std": 0.1328299343585968, "rewards/simpleverify_reward/mean": 0.7098214030265808, "rewards/simpleverify_reward/std": 0.454098105430603, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3097.0, "completions/max_terminated_length": 3097.0, "completions/mean_length": 573.2611694335938, "completions/mean_terminated_length": 573.2611694335938, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 12.15860058309038, "grad_norm": 0.17080067098140717, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 851209058.0, "reward": 0.7723214626312256, "reward_std": 0.10626383125782013, "rewards/simpleverify_reward/mean": 0.7723214030265808, "rewards/simpleverify_reward/std": 0.4195682406425476, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3666.0, "completions/mean_length": 675.9732666015625, "completions/mean_terminated_length": 672.1519165039062, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 12.167930029154519, "grad_norm": 0.1695433109998703, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 851905170.0, "reward": 0.7042410969734192, "reward_std": 0.1328292191028595, "rewards/simpleverify_reward/mean": 0.7042410969734192, "rewards/simpleverify_reward/std": 0.45663803815841675, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 666.5390625, "completions/mean_terminated_length": 631.7418212890625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 12.177259475218658, "grad_norm": 0.15784358978271484, "learning_rate": 1e-06, "loss": 0.0438, "num_tokens": 852585101.0, "reward": 0.7611607313156128, "reward_std": 0.12189590930938721, "rewards/simpleverify_reward/mean": 0.7611607313156128, "rewards/simpleverify_reward/std": 0.4266124963760376, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2550.0, "completions/mean_length": 600.03125, "completions/mean_terminated_length": 592.2102661132812, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 12.186588921282798, "grad_norm": 0.14486075937747955, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 853202409.0, "reward": 0.7678571939468384, "reward_std": 0.08901664614677429, "rewards/simpleverify_reward/mean": 0.7678571343421936, "rewards/simpleverify_reward/std": 0.422435462474823, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3364.0, "completions/mean_length": 689.3058471679688, "completions/mean_terminated_length": 670.1885986328125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 12.19591836734694, "grad_norm": 0.13932029902935028, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 853910723.0, "reward": 0.6941964626312256, "reward_std": 0.10746845602989197, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.46100425720214844, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3711.0, "completions/mean_length": 637.5859375, "completions/mean_terminated_length": 614.2708129882812, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 12.205247813411079, "grad_norm": 0.1538495123386383, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 854568368.0, "reward": 0.7187500596046448, "reward_std": 0.10900826752185822, "rewards/simpleverify_reward/mean": 0.71875, "rewards/simpleverify_reward/std": 0.4498603343963623, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 719.2902221679688, "completions/mean_terminated_length": 692.7019653320312, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 12.214577259475218, "grad_norm": 0.15997692942619324, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 855301500.0, "reward": 0.7176339626312256, "reward_std": 0.11652565747499466, "rewards/simpleverify_reward/mean": 0.7176339030265808, "rewards/simpleverify_reward/std": 0.4504019320011139, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3658.0, "completions/mean_length": 719.7600708007812, "completions/mean_terminated_length": 681.6535034179688, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 12.223906705539358, "grad_norm": 0.1493740677833557, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 856032293.0, "reward": 0.6774553656578064, "reward_std": 0.11434531956911087, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3089.0, "completions/mean_length": 720.1629638671875, "completions/mean_terminated_length": 685.9097900390625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 12.2332361516035, "grad_norm": 0.1525895595550537, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 856764367.0, "reward": 0.6517857313156128, "reward_std": 0.12636421620845795, "rewards/simpleverify_reward/mean": 0.6517857313156128, "rewards/simpleverify_reward/std": 0.47667041420936584, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 677.2567138671875, "completions/mean_terminated_length": 658.0718383789062, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 12.242565597667639, "grad_norm": 0.1543329507112503, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 857458461.0, "reward": 0.7287946939468384, "reward_std": 0.12633031606674194, "rewards/simpleverify_reward/mean": 0.7287946343421936, "rewards/simpleverify_reward/std": 0.44483017921447754, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3304.0, "completions/mean_length": 625.765625, "completions/mean_terminated_length": 614.1075439453125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 12.251895043731778, "grad_norm": 0.1657327264547348, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 858099491.0, "reward": 0.7477678656578064, "reward_std": 0.11794712394475937, "rewards/simpleverify_reward/mean": 0.7477678656578064, "rewards/simpleverify_reward/std": 0.434536337852478, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3914.0, "completions/mean_length": 727.8672485351562, "completions/mean_terminated_length": 701.3464965820312, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 12.261224489795918, "grad_norm": 0.16353529691696167, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 858844004.0, "reward": 0.6774553656578064, "reward_std": 0.15724819898605347, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 4020.0, "completions/mean_length": 751.0714721679688, "completions/mean_terminated_length": 705.6651611328125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 12.270553935860057, "grad_norm": 0.12865863740444183, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 859602588.0, "reward": 0.699776828289032, "reward_std": 0.08901593834161758, "rewards/simpleverify_reward/mean": 0.6997767686843872, "rewards/simpleverify_reward/std": 0.4586109220981598, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 626.9921875, "completions/mean_terminated_length": 615.3381958007812, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 12.279883381924199, "grad_norm": 0.17844611406326294, "learning_rate": 1e-06, "loss": 0.0368, "num_tokens": 860255405.0, "reward": 0.7276785969734192, "reward_std": 0.13083365559577942, "rewards/simpleverify_reward/mean": 0.7276785969734192, "rewards/simpleverify_reward/std": 0.4454030692577362, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 594.575927734375, "completions/mean_terminated_length": 555.0564575195312, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 12.289212827988338, "grad_norm": 0.16686268150806427, "learning_rate": 1e-06, "loss": 0.0365, "num_tokens": 860868361.0, "reward": 0.7600446939468384, "reward_std": 0.12677791714668274, "rewards/simpleverify_reward/mean": 0.7600446343421936, "rewards/simpleverify_reward/std": 0.42729446291923523, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3356.0, "completions/mean_length": 651.5949096679688, "completions/mean_terminated_length": 632.2660522460938, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 12.298542274052478, "grad_norm": 0.1832900494337082, "learning_rate": 1e-06, "loss": 0.0232, "num_tokens": 861544862.0, "reward": 0.7109375596046448, "reward_std": 0.12084423005580902, "rewards/simpleverify_reward/mean": 0.7109375, "rewards/simpleverify_reward/std": 0.45358020067214966, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3796.0, "completions/mean_length": 633.5692138671875, "completions/mean_terminated_length": 629.7005615234375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 12.307871720116617, "grad_norm": 0.14915356040000916, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 862199684.0, "reward": 0.7444196939468384, "reward_std": 0.10118637979030609, "rewards/simpleverify_reward/mean": 0.7444196343421936, "rewards/simpleverify_reward/std": 0.43643057346343994, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3779.0, "completions/mean_length": 733.2076416015625, "completions/mean_terminated_length": 718.1278686523438, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 12.317201166180759, "grad_norm": 0.1696404069662094, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 862956190.0, "reward": 0.6729910969734192, "reward_std": 0.14263640344142914, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.46938255429267883, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3437.0, "completions/mean_length": 636.625, "completions/mean_terminated_length": 617.212158203125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 12.326530612244898, "grad_norm": 0.1721809357404709, "learning_rate": 1e-06, "loss": 0.024, "num_tokens": 863609894.0, "reward": 0.7533482313156128, "reward_std": 0.1205386146903038, "rewards/simpleverify_reward/mean": 0.7533482313156128, "rewards/simpleverify_reward/std": 0.4313030242919922, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3930.0, "completions/mean_length": 733.2767944335938, "completions/mean_terminated_length": 706.7987060546875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 12.335860058309038, "grad_norm": 0.13640883564949036, "learning_rate": 1e-06, "loss": 0.0045, "num_tokens": 864353310.0, "reward": 0.6941964626312256, "reward_std": 0.10577667504549026, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.4610042870044708, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4025.0, "completions/mean_length": 757.7756958007812, "completions/mean_terminated_length": 723.9041137695312, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 12.345189504373177, "grad_norm": 0.15329498052597046, "learning_rate": 1e-06, "loss": 0.0155, "num_tokens": 865116901.0, "reward": 0.6662946939468384, "reward_std": 0.12200286239385605, "rewards/simpleverify_reward/mean": 0.6662946343421936, "rewards/simpleverify_reward/std": 0.47179925441741943, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3897.0, "completions/mean_length": 694.7142944335938, "completions/mean_terminated_length": 656.3250732421875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 12.354518950437317, "grad_norm": 0.15375149250030518, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 865826173.0, "reward": 0.7109375596046448, "reward_std": 0.09412476420402527, "rewards/simpleverify_reward/mean": 0.7109375, "rewards/simpleverify_reward/std": 0.45358020067214966, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 676.7779541015625, "completions/mean_terminated_length": 653.7269897460938, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 12.363848396501458, "grad_norm": 0.14024432003498077, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 866519846.0, "reward": 0.6986607313156128, "reward_std": 0.10280217975378036, "rewards/simpleverify_reward/mean": 0.6986607313156128, "rewards/simpleverify_reward/std": 0.4590960443019867, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4072.0, "completions/mean_length": 729.9676513671875, "completions/mean_terminated_length": 703.4635009765625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 12.373177842565598, "grad_norm": 0.15041975677013397, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 867269993.0, "reward": 0.6774553656578064, "reward_std": 0.14252763986587524, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 751.1295166015625, "completions/mean_terminated_length": 732.3591918945312, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 12.382507288629737, "grad_norm": 0.16192053258419037, "learning_rate": 1e-06, "loss": 0.0222, "num_tokens": 868028797.0, "reward": 0.6774553656578064, "reward_std": 0.14564046263694763, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3644.0, "completions/mean_length": 848.1719360351562, "completions/mean_terminated_length": 770.2239990234375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 12.391836734693877, "grad_norm": 0.13543808460235596, "learning_rate": 1e-06, "loss": 0.0402, "num_tokens": 868881719.0, "reward": 0.6238839626312256, "reward_std": 0.1374162882566452, "rewards/simpleverify_reward/mean": 0.6238839030265808, "rewards/simpleverify_reward/std": 0.48468026518821716, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3939.0, "completions/mean_length": 674.875, "completions/mean_terminated_length": 659.53369140625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 12.401166180758018, "grad_norm": 0.16163037717342377, "learning_rate": 1e-06, "loss": 0.0338, "num_tokens": 869565031.0, "reward": 0.7020089626312256, "reward_std": 0.12362048029899597, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763099193573, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3521.0, "completions/mean_length": 676.021240234375, "completions/mean_terminated_length": 649.09228515625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 12.410495626822158, "grad_norm": 0.15974025428295135, "learning_rate": 1e-06, "loss": 0.0208, "num_tokens": 870259938.0, "reward": 0.7399553656578064, "reward_std": 0.14658519625663757, "rewards/simpleverify_reward/mean": 0.7399553656578064, "rewards/simpleverify_reward/std": 0.43890365958213806, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 601.9408569335938, "completions/mean_terminated_length": 590.2026977539062, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 12.419825072886297, "grad_norm": 0.13089649379253387, "learning_rate": 1e-06, "loss": 0.0283, "num_tokens": 870889461.0, "reward": 0.738839328289032, "reward_std": 0.09833527356386185, "rewards/simpleverify_reward/mean": 0.7388392686843872, "rewards/simpleverify_reward/std": 0.439512699842453, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 652.0558471679688, "completions/mean_terminated_length": 648.2078247070312, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 12.429154518950437, "grad_norm": 0.17474883794784546, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 871574807.0, "reward": 0.7209821939468384, "reward_std": 0.1449248194694519, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3241.0, "completions/mean_length": 671.828125, "completions/mean_terminated_length": 660.3247680664062, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 12.438483965014576, "grad_norm": 0.15115238726139069, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 872264317.0, "reward": 0.7500000596046448, "reward_std": 0.13549739122390747, "rewards/simpleverify_reward/mean": 0.75, "rewards/simpleverify_reward/std": 0.43325456976890564, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 582.0011596679688, "completions/mean_terminated_length": 574.1398315429688, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 12.447813411078718, "grad_norm": 0.1933545023202896, "learning_rate": 1e-06, "loss": 0.0216, "num_tokens": 872871814.0, "reward": 0.793526828289032, "reward_std": 0.16003583371639252, "rewards/simpleverify_reward/mean": 0.7935267686843872, "rewards/simpleverify_reward/std": 0.40500015020370483, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 700.232177734375, "completions/mean_terminated_length": 688.82421875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 12.457142857142857, "grad_norm": 0.15569929778575897, "learning_rate": 1e-06, "loss": 0.031, "num_tokens": 873600222.0, "reward": 0.6785714626312256, "reward_std": 0.1140415221452713, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3172.0, "completions/mean_length": 671.7120971679688, "completions/mean_terminated_length": 648.626953125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 12.466472303206997, "grad_norm": 0.17193061113357544, "learning_rate": 1e-06, "loss": 0.0327, "num_tokens": 874294140.0, "reward": 0.7142857313156128, "reward_std": 0.14048857986927032, "rewards/simpleverify_reward/mean": 0.7142857313156128, "rewards/simpleverify_reward/std": 0.4520062506198883, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2866.0, "completions/mean_length": 586.1205444335938, "completions/mean_terminated_length": 570.3811645507812, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 12.475801749271136, "grad_norm": 0.16058114171028137, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 874899920.0, "reward": 0.7767857313156128, "reward_std": 0.0968710258603096, "rewards/simpleverify_reward/mean": 0.7767857313156128, "rewards/simpleverify_reward/std": 0.41663336753845215, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4035.0, "completions/max_terminated_length": 4035.0, "completions/mean_length": 668.9609375, "completions/mean_terminated_length": 668.9609375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 12.485131195335278, "grad_norm": 0.17387399077415466, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 875594077.0, "reward": 0.7533482313156128, "reward_std": 0.14162608981132507, "rewards/simpleverify_reward/mean": 0.7533482313156128, "rewards/simpleverify_reward/std": 0.4313030242919922, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3405.0, "completions/mean_length": 661.8984375, "completions/mean_terminated_length": 623.1388549804688, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 12.494460641399417, "grad_norm": 0.1662472039461136, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 876279490.0, "reward": 0.7488839626312256, "reward_std": 0.13542000949382782, "rewards/simpleverify_reward/mean": 0.7488839030265808, "rewards/simpleverify_reward/std": 0.43389734625816345, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 696.4531860351562, "completions/mean_terminated_length": 673.5348510742188, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 12.503790087463557, "grad_norm": 0.22965191304683685, "learning_rate": 1e-06, "loss": 0.0314, "num_tokens": 876993288.0, "reward": 0.7008928656578064, "reward_std": 0.13711389899253845, "rewards/simpleverify_reward/mean": 0.7008928656578064, "rewards/simpleverify_reward/std": 0.4581226110458374, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3368.0, "completions/mean_length": 733.4285888671875, "completions/mean_terminated_length": 699.3099975585938, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 12.513119533527696, "grad_norm": 0.1384902149438858, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 877751408.0, "reward": 0.6651785969734192, "reward_std": 0.11208872497081757, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219157218933105, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3786.0, "completions/mean_length": 698.4219360351562, "completions/mean_terminated_length": 679.3558349609375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 12.522448979591836, "grad_norm": 0.16897131502628326, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 878467162.0, "reward": 0.7287946939468384, "reward_std": 0.15135519206523895, "rewards/simpleverify_reward/mean": 0.7287946343421936, "rewards/simpleverify_reward/std": 0.44483017921447754, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2875.0, "completions/mean_length": 675.9955444335938, "completions/mean_terminated_length": 645.1846923828125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 12.531778425655977, "grad_norm": 0.1549491584300995, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 879156150.0, "reward": 0.7020089626312256, "reward_std": 0.11419306695461273, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763102173805237, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2407.0, "completions/mean_length": 686.4241333007812, "completions/mean_terminated_length": 651.82861328125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 12.541107871720117, "grad_norm": 0.1449332982301712, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 879859818.0, "reward": 0.6662946939468384, "reward_std": 0.13139818608760834, "rewards/simpleverify_reward/mean": 0.6662946343421936, "rewards/simpleverify_reward/std": 0.47179925441741943, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 661.0379638671875, "completions/mean_terminated_length": 649.4983520507812, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 12.550437317784256, "grad_norm": 0.17148959636688232, "learning_rate": 1e-06, "loss": 0.0241, "num_tokens": 880534556.0, "reward": 0.7120535969734192, "reward_std": 0.1349014788866043, "rewards/simpleverify_reward/mean": 0.7120535969734192, "rewards/simpleverify_reward/std": 0.4530589282512665, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3183.0, "completions/mean_length": 592.1707763671875, "completions/mean_terminated_length": 588.255859375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 12.559766763848396, "grad_norm": 0.1510639637708664, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 881145605.0, "reward": 0.754464328289032, "reward_std": 0.11513626575469971, "rewards/simpleverify_reward/mean": 0.7544642686843872, "rewards/simpleverify_reward/std": 0.4306447505950928, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 656.7600708007812, "completions/mean_terminated_length": 645.2060546875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 12.569096209912537, "grad_norm": 0.18001866340637207, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 881827470.0, "reward": 0.6953125596046448, "reward_std": 0.1483079493045807, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 617.138427734375, "completions/mean_terminated_length": 601.5381469726562, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 12.578425655976677, "grad_norm": 0.15336672961711884, "learning_rate": 1e-06, "loss": 0.0298, "num_tokens": 882474738.0, "reward": 0.7566964626312256, "reward_std": 0.10634050518274307, "rewards/simpleverify_reward/mean": 0.7566964030265808, "rewards/simpleverify_reward/std": 0.4293164908885956, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3158.0, "completions/mean_length": 648.130615234375, "completions/mean_terminated_length": 620.9820556640625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 12.587755102040816, "grad_norm": 0.17810018360614777, "learning_rate": 1e-06, "loss": 0.0298, "num_tokens": 883145791.0, "reward": 0.7176339626312256, "reward_std": 0.1519550383090973, "rewards/simpleverify_reward/mean": 0.7176339030265808, "rewards/simpleverify_reward/std": 0.4504019320011139, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 636.5926513671875, "completions/mean_terminated_length": 613.2708129882812, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 12.597084548104956, "grad_norm": 0.15244971215724945, "learning_rate": 1e-06, "loss": 0.0352, "num_tokens": 883804138.0, "reward": 0.7410714626312256, "reward_std": 0.11265283823013306, "rewards/simpleverify_reward/mean": 0.7410714030265808, "rewards/simpleverify_reward/std": 0.43829095363616943, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3724.0, "completions/mean_length": 694.239990234375, "completions/mean_terminated_length": 667.4544677734375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 12.606413994169095, "grad_norm": 0.14549970626831055, "learning_rate": 1e-06, "loss": 0.0076, "num_tokens": 884524305.0, "reward": 0.6674107313156128, "reward_std": 0.12102898210287094, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3812.0, "completions/mean_length": 588.859375, "completions/mean_terminated_length": 584.9407348632812, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 12.615743440233237, "grad_norm": 0.18574932217597961, "learning_rate": 1e-06, "loss": 0.0202, "num_tokens": 885145291.0, "reward": 0.7220982313156128, "reward_std": 0.16889801621437073, "rewards/simpleverify_reward/mean": 0.7220982313156128, "rewards/simpleverify_reward/std": 0.44821491837501526, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2714.0, "completions/mean_length": 667.8671875, "completions/mean_terminated_length": 644.7561645507812, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 12.625072886297376, "grad_norm": 0.1693604737520218, "learning_rate": 1e-06, "loss": 0.0188, "num_tokens": 885833196.0, "reward": 0.6796875596046448, "reward_std": 0.13474993407726288, "rewards/simpleverify_reward/mean": 0.6796875, "rewards/simpleverify_reward/std": 0.4668572247028351, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3956.0, "completions/mean_length": 697.4107666015625, "completions/mean_terminated_length": 651.2760620117188, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 12.634402332361516, "grad_norm": 0.15090271830558777, "learning_rate": 1e-06, "loss": 0.0311, "num_tokens": 886545988.0, "reward": 0.676339328289032, "reward_std": 0.12569020688533783, "rewards/simpleverify_reward/mean": 0.6763392686843872, "rewards/simpleverify_reward/std": 0.4681335985660553, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3836.0, "completions/mean_length": 677.4788208007812, "completions/mean_terminated_length": 650.5613403320312, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 12.643731778425655, "grad_norm": 0.15127409994602203, "learning_rate": 1e-06, "loss": 0.017, "num_tokens": 887247041.0, "reward": 0.7020089626312256, "reward_std": 0.11971516907215118, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763099193573, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3173.0, "completions/mean_length": 578.8560791015625, "completions/mean_terminated_length": 551.1619873046875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 12.653061224489797, "grad_norm": 0.10925968736410141, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 887843032.0, "reward": 0.762276828289032, "reward_std": 0.05978236719965935, "rewards/simpleverify_reward/mean": 0.7622767686843872, "rewards/simpleverify_reward/std": 0.42592647671699524, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3981.0, "completions/mean_length": 654.5078125, "completions/mean_terminated_length": 646.8087158203125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 12.662390670553936, "grad_norm": 0.159852996468544, "learning_rate": 1e-06, "loss": 0.0149, "num_tokens": 888522631.0, "reward": 0.7176339626312256, "reward_std": 0.13842658698558807, "rewards/simpleverify_reward/mean": 0.7176339030265808, "rewards/simpleverify_reward/std": 0.4504019320011139, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 695.1842041015625, "completions/mean_terminated_length": 652.9141235351562, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 12.671720116618076, "grad_norm": 0.15287722647190094, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 889247188.0, "reward": 0.6729910969734192, "reward_std": 0.14053276181221008, "rewards/simpleverify_reward/mean": 0.6729910969734192, "rewards/simpleverify_reward/std": 0.4693825840950012, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3776.0, "completions/mean_length": 713.8270263671875, "completions/mean_terminated_length": 683.3569946289062, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 12.681049562682215, "grad_norm": 0.1489296555519104, "learning_rate": 1e-06, "loss": 0.0361, "num_tokens": 889973561.0, "reward": 0.691964328289032, "reward_std": 0.13264447450637817, "rewards/simpleverify_reward/mean": 0.6919642686843872, "rewards/simpleverify_reward/std": 0.4619392454624176, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 593.734375, "completions/mean_terminated_length": 581.9686889648438, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 12.690379008746355, "grad_norm": 0.17499499022960663, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 890591779.0, "reward": 0.7678571939468384, "reward_std": 0.11910827457904816, "rewards/simpleverify_reward/mean": 0.7678571343421936, "rewards/simpleverify_reward/std": 0.422435462474823, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2969.0, "completions/mean_length": 603.8717041015625, "completions/mean_terminated_length": 588.2119140625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 12.699708454810496, "grad_norm": 0.16927321255207062, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 891222904.0, "reward": 0.7801339626312256, "reward_std": 0.13293910026550293, "rewards/simpleverify_reward/mean": 0.7801339030265808, "rewards/simpleverify_reward/std": 0.4143870770931244, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2709.0, "completions/mean_length": 685.9207763671875, "completions/mean_terminated_length": 659.0697631835938, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 12.709037900874636, "grad_norm": 0.16775479912757874, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 891920769.0, "reward": 0.6830357313156128, "reward_std": 0.15999193489551544, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3705.0, "completions/mean_length": 620.2723388671875, "completions/mean_terminated_length": 616.3887939453125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 12.718367346938775, "grad_norm": 0.16153395175933838, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 892557829.0, "reward": 0.7131696939468384, "reward_std": 0.12974846363067627, "rewards/simpleverify_reward/mean": 0.7131696343421936, "rewards/simpleverify_reward/std": 0.4525342881679535, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 715.7109985351562, "completions/mean_terminated_length": 696.7418823242188, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 12.727696793002915, "grad_norm": 0.1673622876405716, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 893290370.0, "reward": 0.582589328289032, "reward_std": 0.1409093141555786, "rewards/simpleverify_reward/mean": 0.5825892686843872, "rewards/simpleverify_reward/std": 0.4934072494506836, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3267.0, "completions/mean_length": 687.6395263671875, "completions/mean_terminated_length": 680.0145263671875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 12.737026239067056, "grad_norm": 0.17473258078098297, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 893999815.0, "reward": 0.6640625, "reward_std": 0.1602187603712082, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 646.4074096679688, "completions/mean_terminated_length": 638.690185546875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 12.746355685131196, "grad_norm": 0.17025722563266754, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 894680268.0, "reward": 0.6975446939468384, "reward_std": 0.13455308973789215, "rewards/simpleverify_reward/mean": 0.6975446343421936, "rewards/simpleverify_reward/std": 0.45957788825035095, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2422.0, "completions/mean_length": 613.4598388671875, "completions/mean_terminated_length": 597.8430786132812, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 12.755685131195335, "grad_norm": 0.15506497025489807, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 895310008.0, "reward": 0.7500000596046448, "reward_std": 0.11381400376558304, "rewards/simpleverify_reward/mean": 0.75, "rewards/simpleverify_reward/std": 0.43325456976890564, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 695.7723388671875, "completions/mean_terminated_length": 653.5096435546875, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 12.765014577259475, "grad_norm": 0.14836110174655914, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 896023876.0, "reward": 0.6875000596046448, "reward_std": 0.10231363028287888, "rewards/simpleverify_reward/mean": 0.6875, "rewards/simpleverify_reward/std": 0.4637712836265564, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4070.0, "completions/mean_length": 672.09375, "completions/mean_terminated_length": 641.2477416992188, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 12.774344023323614, "grad_norm": 0.18558120727539062, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 896712160.0, "reward": 0.6863839626312256, "reward_std": 0.12253489345312119, "rewards/simpleverify_reward/mean": 0.6863839030265808, "rewards/simpleverify_reward/std": 0.46422144770622253, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3588.0, "completions/mean_length": 605.6808471679688, "completions/mean_terminated_length": 590.0291748046875, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 12.783673469387756, "grad_norm": 0.15163227915763855, "learning_rate": 1e-06, "loss": 0.018, "num_tokens": 897340994.0, "reward": 0.7265625596046448, "reward_std": 0.12099508196115494, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3122.0, "completions/mean_length": 663.6707763671875, "completions/mean_terminated_length": 652.1400146484375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 12.793002915451895, "grad_norm": 0.15570123493671417, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 898037451.0, "reward": 0.7187500596046448, "reward_std": 0.14225731790065765, "rewards/simpleverify_reward/mean": 0.71875, "rewards/simpleverify_reward/std": 0.4498603343963623, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3324.0, "completions/mean_length": 654.8873291015625, "completions/mean_terminated_length": 623.8862915039062, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 12.802332361516035, "grad_norm": 0.16584311425685883, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 898714990.0, "reward": 0.7511160969734192, "reward_std": 0.12779708206653595, "rewards/simpleverify_reward/mean": 0.7511160969734192, "rewards/simpleverify_reward/std": 0.43260788917541504, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3803.0, "completions/mean_length": 700.7098388671875, "completions/mean_terminated_length": 673.9752807617188, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 12.811661807580174, "grad_norm": 0.17206734418869019, "learning_rate": 1e-06, "loss": 0.0293, "num_tokens": 899445834.0, "reward": 0.6417410969734192, "reward_std": 0.13981667160987854, "rewards/simpleverify_reward/mean": 0.6417410969734192, "rewards/simpleverify_reward/std": 0.47975656390190125, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 700.8114013671875, "completions/mean_terminated_length": 662.490966796875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 12.820991253644316, "grad_norm": 0.14822053909301758, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 900167161.0, "reward": 0.6540178656578064, "reward_std": 0.11314251273870468, "rewards/simpleverify_reward/mean": 0.6540178656578064, "rewards/simpleverify_reward/std": 0.4759531021118164, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3680.0, "completions/mean_length": 734.247802734375, "completions/mean_terminated_length": 703.9617309570312, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 12.830320699708455, "grad_norm": 0.14357078075408936, "learning_rate": 1e-06, "loss": 0.0191, "num_tokens": 900921759.0, "reward": 0.7053571939468384, "reward_std": 0.10667569190263748, "rewards/simpleverify_reward/mean": 0.7053571343421936, "rewards/simpleverify_reward/std": 0.45613667368888855, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3116.0, "completions/mean_length": 656.1796875, "completions/mean_terminated_length": 629.0945434570312, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 12.839650145772595, "grad_norm": 0.14599576592445374, "learning_rate": 1e-06, "loss": 0.0159, "num_tokens": 901596712.0, "reward": 0.7120535969734192, "reward_std": 0.12531183660030365, "rewards/simpleverify_reward/mean": 0.7120535969734192, "rewards/simpleverify_reward/std": 0.4530589282512665, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2864.0, "completions/mean_length": 620.3303833007812, "completions/mean_terminated_length": 612.5548095703125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 12.848979591836734, "grad_norm": 0.16541419923305511, "learning_rate": 1e-06, "loss": 0.0282, "num_tokens": 902244512.0, "reward": 0.7533482313156128, "reward_std": 0.11719966679811478, "rewards/simpleverify_reward/mean": 0.7533482313156128, "rewards/simpleverify_reward/std": 0.4313029944896698, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3239.0, "completions/mean_length": 685.4553833007812, "completions/mean_terminated_length": 654.729736328125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 12.858309037900874, "grad_norm": 0.16653281450271606, "learning_rate": 1e-06, "loss": 0.0324, "num_tokens": 902946720.0, "reward": 0.6930803656578064, "reward_std": 0.14496758580207825, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147334575653076, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4032.0, "completions/mean_length": 698.9721069335938, "completions/mean_terminated_length": 676.07080078125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 12.867638483965015, "grad_norm": 0.15761399269104004, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 903657983.0, "reward": 0.6975446939468384, "reward_std": 0.1288815438747406, "rewards/simpleverify_reward/mean": 0.6975446343421936, "rewards/simpleverify_reward/std": 0.45957791805267334, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3811.0, "completions/mean_length": 739.3002319335938, "completions/mean_terminated_length": 731.7908325195312, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 12.876967930029155, "grad_norm": 0.13531531393527985, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 904423452.0, "reward": 0.6104910969734192, "reward_std": 0.11835829168558121, "rewards/simpleverify_reward/mean": 0.6104910969734192, "rewards/simpleverify_reward/std": 0.48791128396987915, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 634.771240234375, "completions/mean_terminated_length": 623.1433715820312, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 12.886297376093294, "grad_norm": 0.14979025721549988, "learning_rate": 1e-06, "loss": 0.0262, "num_tokens": 905089199.0, "reward": 0.7187500596046448, "reward_std": 0.11393275856971741, "rewards/simpleverify_reward/mean": 0.71875, "rewards/simpleverify_reward/std": 0.4498603343963623, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2954.0, "completions/mean_length": 692.1596069335938, "completions/mean_terminated_length": 684.5447387695312, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 12.895626822157434, "grad_norm": 0.15881288051605225, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 905795862.0, "reward": 0.699776828289032, "reward_std": 0.1235131099820137, "rewards/simpleverify_reward/mean": 0.6997767686843872, "rewards/simpleverify_reward/std": 0.4586109220981598, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3285.0, "completions/mean_length": 706.5748291015625, "completions/mean_terminated_length": 679.8864135742188, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 12.904956268221575, "grad_norm": 0.1591111421585083, "learning_rate": 1e-06, "loss": 0.0343, "num_tokens": 906526465.0, "reward": 0.7120535969734192, "reward_std": 0.13121342658996582, "rewards/simpleverify_reward/mean": 0.7120535969734192, "rewards/simpleverify_reward/std": 0.4530589282512665, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2786.0, "completions/max_terminated_length": 2786.0, "completions/mean_length": 609.4140625, "completions/mean_terminated_length": 609.4140625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 12.914285714285715, "grad_norm": 0.17342890799045563, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 907153044.0, "reward": 0.7500000596046448, "reward_std": 0.13429416716098785, "rewards/simpleverify_reward/mean": 0.75, "rewards/simpleverify_reward/std": 0.43325456976890564, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3676.0, "completions/mean_length": 604.515625, "completions/mean_terminated_length": 596.7047119140625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 12.923615160349854, "grad_norm": 0.1626547873020172, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 907792578.0, "reward": 0.7912946939468384, "reward_std": 0.11167643964290619, "rewards/simpleverify_reward/mean": 0.7912946343421936, "rewards/simpleverify_reward/std": 0.4066103398799896, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3480.0, "completions/mean_length": 665.0502319335938, "completions/mean_terminated_length": 630.2378540039062, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 12.932944606413994, "grad_norm": 0.1831541508436203, "learning_rate": 1e-06, "loss": 0.0257, "num_tokens": 908480887.0, "reward": 0.7276785969734192, "reward_std": 0.13436946272850037, "rewards/simpleverify_reward/mean": 0.7276785969734192, "rewards/simpleverify_reward/std": 0.4454030692577362, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3320.0, "completions/mean_length": 639.1998291015625, "completions/mean_terminated_length": 631.4664306640625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 12.942274052478133, "grad_norm": 0.18518105149269104, "learning_rate": 1e-06, "loss": 0.0373, "num_tokens": 909142818.0, "reward": 0.7433035969734192, "reward_std": 0.14409995079040527, "rewards/simpleverify_reward/mean": 0.7433035969734192, "rewards/simpleverify_reward/std": 0.43705442547798157, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3295.0, "completions/mean_length": 651.9989013671875, "completions/mean_terminated_length": 644.294189453125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 12.951603498542275, "grad_norm": 0.18084841966629028, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 909813225.0, "reward": 0.6752232313156128, "reward_std": 0.13830895721912384, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2254.0, "completions/mean_length": 590.8951416015625, "completions/mean_terminated_length": 575.1771850585938, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 12.960932944606414, "grad_norm": 0.14898180961608887, "learning_rate": 1e-06, "loss": 0.0225, "num_tokens": 910439251.0, "reward": 0.7165178656578064, "reward_std": 0.11735119670629501, "rewards/simpleverify_reward/mean": 0.7165178656578064, "rewards/simpleverify_reward/std": 0.4509401023387909, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3171.0, "completions/mean_length": 635.5960083007812, "completions/mean_terminated_length": 608.3487548828125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 12.970262390670554, "grad_norm": 0.148504376411438, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 911095961.0, "reward": 0.6674107313156128, "reward_std": 0.1170455813407898, "rewards/simpleverify_reward/mean": 0.6674107313156128, "rewards/simpleverify_reward/std": 0.47140392661094666, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3894.0, "completions/mean_length": 663.2098388671875, "completions/mean_terminated_length": 628.3787841796875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 12.979591836734693, "grad_norm": 0.16956795752048492, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 911781413.0, "reward": 0.6819196939468384, "reward_std": 0.12073546648025513, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2842.0, "completions/mean_length": 685.9107666015625, "completions/mean_terminated_length": 666.7744140625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 12.988921282798835, "grad_norm": 0.16132549941539764, "learning_rate": 1e-06, "loss": 0.0084, "num_tokens": 912484557.0, "reward": 0.7020089626312256, "reward_std": 0.13932561874389648, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763099193573, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0028409090909090606, "completions/max_length": 4096.0, "completions/max_terminated_length": 2760.0, "completions/mean_length": 677.8153686523438, "completions/mean_terminated_length": 668.076904296875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 12.998250728862974, "grad_norm": 0.1428583264350891, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 913139825.0, "reward": 0.6975446939468384, "reward_std": 0.11287854611873627, "rewards/simpleverify_reward/mean": 0.6975446343421936, "rewards/simpleverify_reward/std": 0.45957788825035095, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3206.0, "completions/mean_length": 683.6819458007812, "completions/mean_terminated_length": 633.4439086914062, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 13.00932944606414, "grad_norm": 0.14939911663532257, "learning_rate": 1e-06, "loss": 0.0282, "num_tokens": 913841052.0, "reward": 0.6897321939468384, "reward_std": 0.1405295431613922, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2645.0, "completions/mean_length": 678.2623291015625, "completions/mean_terminated_length": 659.0830688476562, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 13.018658892128279, "grad_norm": 0.14573998749256134, "learning_rate": 1e-06, "loss": 0.0165, "num_tokens": 914528143.0, "reward": 0.7678571939468384, "reward_std": 0.12595123052597046, "rewards/simpleverify_reward/mean": 0.7678571343421936, "rewards/simpleverify_reward/std": 0.422435462474823, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2604.0, "completions/mean_length": 698.1986694335938, "completions/mean_terminated_length": 686.7838745117188, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 13.02798833819242, "grad_norm": 0.17775645852088928, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 915244257.0, "reward": 0.6104910969734192, "reward_std": 0.1419203132390976, "rewards/simpleverify_reward/mean": 0.6104910969734192, "rewards/simpleverify_reward/std": 0.48791125416755676, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 619.5335083007812, "completions/mean_terminated_length": 596.0966186523438, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 13.03731778425656, "grad_norm": 0.14194048941135406, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 915886887.0, "reward": 0.715401828289032, "reward_std": 0.08777810633182526, "rewards/simpleverify_reward/mean": 0.7154017686843872, "rewards/simpleverify_reward/std": 0.4514748752117157, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3839.0, "completions/mean_length": 636.1004638671875, "completions/mean_terminated_length": 620.585205078125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 13.0466472303207, "grad_norm": 0.165063738822937, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 916548873.0, "reward": 0.707589328289032, "reward_std": 0.128134086728096, "rewards/simpleverify_reward/mean": 0.7075892686843872, "rewards/simpleverify_reward/std": 0.45512402057647705, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3103.0, "completions/mean_length": 592.4285888671875, "completions/mean_terminated_length": 584.5906372070312, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 13.055976676384839, "grad_norm": 0.15969377756118774, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 917172641.0, "reward": 0.762276828289032, "reward_std": 0.1349763423204422, "rewards/simpleverify_reward/mean": 0.7622767686843872, "rewards/simpleverify_reward/std": 0.42592647671699524, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 603.950927734375, "completions/mean_terminated_length": 580.4089965820312, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 13.06530612244898, "grad_norm": 0.168379545211792, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 917791669.0, "reward": 0.7868303656578064, "reward_std": 0.13038787245750427, "rewards/simpleverify_reward/mean": 0.7868303656578064, "rewards/simpleverify_reward/std": 0.4097752273082733, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2386.0, "completions/mean_length": 653.5658569335938, "completions/mean_terminated_length": 638.1289672851562, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 13.07463556851312, "grad_norm": 0.17599038779735565, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 918465688.0, "reward": 0.7377232313156128, "reward_std": 0.1436121016740799, "rewards/simpleverify_reward/mean": 0.7377232313156128, "rewards/simpleverify_reward/std": 0.4401180148124695, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3822.0, "completions/mean_length": 675.1495971679688, "completions/mean_terminated_length": 648.2137451171875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 13.08396501457726, "grad_norm": 0.1363927125930786, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 919167558.0, "reward": 0.7299107313156128, "reward_std": 0.11329223960638046, "rewards/simpleverify_reward/mean": 0.7299107313156128, "rewards/simpleverify_reward/std": 0.44425368309020996, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2527.0, "completions/mean_length": 650.9375, "completions/mean_terminated_length": 647.0882568359375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 13.093294460641399, "grad_norm": 0.17180322110652924, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 919832630.0, "reward": 0.7399553656578064, "reward_std": 0.12648479640483856, "rewards/simpleverify_reward/mean": 0.7399553656578064, "rewards/simpleverify_reward/std": 0.43890365958213806, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2987.0, "completions/mean_length": 645.3192138671875, "completions/mean_terminated_length": 629.8453369140625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 13.102623906705539, "grad_norm": 0.14834865927696228, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 920503060.0, "reward": 0.7533482313156128, "reward_std": 0.11069933325052261, "rewards/simpleverify_reward/mean": 0.7533482313156128, "rewards/simpleverify_reward/std": 0.4313030242919922, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2824.0, "completions/mean_length": 613.78125, "completions/mean_terminated_length": 605.9910278320312, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 13.11195335276968, "grad_norm": 0.1520848125219345, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 921148368.0, "reward": 0.684151828289032, "reward_std": 0.11779557168483734, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.4651124179363251, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3091.0, "completions/mean_length": 692.6060791015625, "completions/mean_terminated_length": 658.0732421875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 13.12128279883382, "grad_norm": 0.1204674020409584, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 921860463.0, "reward": 0.7109375596046448, "reward_std": 0.10092677175998688, "rewards/simpleverify_reward/mean": 0.7109375, "rewards/simpleverify_reward/std": 0.45358020067214966, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2540.0, "completions/mean_length": 741.8292846679688, "completions/mean_terminated_length": 726.7881469726562, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 13.130612244897959, "grad_norm": 0.14610907435417175, "learning_rate": 1e-06, "loss": 0.0171, "num_tokens": 922621062.0, "reward": 0.6272321939468384, "reward_std": 0.133773535490036, "rewards/simpleverify_reward/mean": 0.6272321343421936, "rewards/simpleverify_reward/std": 0.4838111698627472, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 4030.0, "completions/mean_length": 695.7433471679688, "completions/mean_terminated_length": 653.480224609375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 13.139941690962099, "grad_norm": 0.1684454381465912, "learning_rate": 1e-06, "loss": 0.0306, "num_tokens": 923338592.0, "reward": 0.6975446939468384, "reward_std": 0.1500752717256546, "rewards/simpleverify_reward/mean": 0.6975446343421936, "rewards/simpleverify_reward/std": 0.45957788825035095, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3384.0, "completions/mean_length": 612.8839721679688, "completions/mean_terminated_length": 601.1825561523438, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 13.14927113702624, "grad_norm": 0.15498003363609314, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 923968616.0, "reward": 0.7734375596046448, "reward_std": 0.11760760098695755, "rewards/simpleverify_reward/mean": 0.7734375, "rewards/simpleverify_reward/std": 0.4188409447669983, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 660.0201416015625, "completions/mean_terminated_length": 629.0653076171875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 13.15860058309038, "grad_norm": 0.14570002257823944, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 924653898.0, "reward": 0.6808035969734192, "reward_std": 0.11088550090789795, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.4664256274700165, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3459.0, "completions/mean_length": 611.8348388671875, "completions/mean_terminated_length": 600.1299438476562, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 13.167930029154519, "grad_norm": 0.14791612327098846, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 925308694.0, "reward": 0.6718750596046448, "reward_std": 0.10795588791370392, "rewards/simpleverify_reward/mean": 0.671875, "rewards/simpleverify_reward/std": 0.46979284286499023, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 580.9765625, "completions/mean_terminated_length": 569.16796875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 13.177259475218658, "grad_norm": 0.15063202381134033, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 925923449.0, "reward": 0.7901785969734192, "reward_std": 0.10367162525653839, "rewards/simpleverify_reward/mean": 0.7901785969734192, "rewards/simpleverify_reward/std": 0.4074084460735321, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2852.0, "completions/mean_length": 613.9241333007812, "completions/mean_terminated_length": 594.3838500976562, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 13.186588921282798, "grad_norm": 0.17023438215255737, "learning_rate": 1e-06, "loss": 0.0439, "num_tokens": 926557909.0, "reward": 0.8024553656578064, "reward_std": 0.13891373574733734, "rewards/simpleverify_reward/mean": 0.8024553656578064, "rewards/simpleverify_reward/std": 0.39836904406547546, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3157.0, "completions/mean_length": 680.5089721679688, "completions/mean_terminated_length": 669.0347290039062, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 13.19591836734694, "grad_norm": 0.14717769622802734, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 927262493.0, "reward": 0.6785714626312256, "reward_std": 0.12787306308746338, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2840.0, "completions/mean_length": 631.5067138671875, "completions/mean_terminated_length": 627.6357421875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 13.205247813411079, "grad_norm": 0.17015454173088074, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 927917595.0, "reward": 0.7176339626312256, "reward_std": 0.13373075425624847, "rewards/simpleverify_reward/mean": 0.7176339030265808, "rewards/simpleverify_reward/std": 0.4504019320011139, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3744.0, "completions/mean_length": 660.474365234375, "completions/mean_terminated_length": 633.4229736328125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 13.214577259475218, "grad_norm": 0.1617119014263153, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 928598076.0, "reward": 0.723214328289032, "reward_std": 0.13459837436676025, "rewards/simpleverify_reward/mean": 0.7232142686843872, "rewards/simpleverify_reward/std": 0.44765952229499817, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3550.0, "completions/mean_length": 678.6808471679688, "completions/mean_terminated_length": 659.5039672851562, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 13.223906705539358, "grad_norm": 0.1703936904668808, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 929302670.0, "reward": 0.6941964626312256, "reward_std": 0.14815638959407806, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.4610042870044708, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3781.0, "completions/mean_length": 595.3270263671875, "completions/mean_terminated_length": 579.6289672851562, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 13.2332361516035, "grad_norm": 0.17466700077056885, "learning_rate": 1e-06, "loss": 0.0186, "num_tokens": 929931947.0, "reward": 0.7142857313156128, "reward_std": 0.12324139475822449, "rewards/simpleverify_reward/mean": 0.7142857313156128, "rewards/simpleverify_reward/std": 0.4520062506198883, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3122.0, "completions/mean_length": 637.9263916015625, "completions/mean_terminated_length": 622.4193115234375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 13.242565597667639, "grad_norm": 0.16151748597621918, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 930583769.0, "reward": 0.7299107313156128, "reward_std": 0.11434461921453476, "rewards/simpleverify_reward/mean": 0.7299107313156128, "rewards/simpleverify_reward/std": 0.44425368309020996, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3303.0, "completions/mean_length": 659.3951416015625, "completions/mean_terminated_length": 632.335205078125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 13.251895043731778, "grad_norm": 0.17136581242084503, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 931267811.0, "reward": 0.7354910969734192, "reward_std": 0.13395968079566956, "rewards/simpleverify_reward/mean": 0.7354910969734192, "rewards/simpleverify_reward/std": 0.44131770730018616, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 678.4453125, "completions/mean_terminated_length": 674.6267700195312, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 13.261224489795918, "grad_norm": 0.16370512545108795, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 931964522.0, "reward": 0.7142857313156128, "reward_std": 0.1245201900601387, "rewards/simpleverify_reward/mean": 0.7142857313156128, "rewards/simpleverify_reward/std": 0.4520062506198883, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2888.0, "completions/mean_length": 573.4029541015625, "completions/mean_terminated_length": 569.467041015625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 13.270553935860057, "grad_norm": 0.15583083033561707, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 932573171.0, "reward": 0.777901828289032, "reward_std": 0.10502780973911285, "rewards/simpleverify_reward/mean": 0.7779017686843872, "rewards/simpleverify_reward/std": 0.4158889651298523, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3712.0, "completions/mean_length": 581.4330444335938, "completions/mean_terminated_length": 573.5704956054688, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 13.279883381924199, "grad_norm": 0.14788256585597992, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 933174983.0, "reward": 0.7254464626312256, "reward_std": 0.08559959381818771, "rewards/simpleverify_reward/mean": 0.7254464030265808, "rewards/simpleverify_reward/std": 0.4465382993221283, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3605.0, "completions/mean_length": 607.0770263671875, "completions/mean_terminated_length": 595.3561401367188, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 13.289212827988338, "grad_norm": 0.1759456843137741, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 933810996.0, "reward": 0.7410714626312256, "reward_std": 0.13403454422950745, "rewards/simpleverify_reward/mean": 0.7410714030265808, "rewards/simpleverify_reward/std": 0.43829092383384705, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2499.0, "completions/mean_length": 602.3660888671875, "completions/mean_terminated_length": 590.6293334960938, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 13.298542274052478, "grad_norm": 0.1636083871126175, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 934440044.0, "reward": 0.7343750596046448, "reward_std": 0.11483317613601685, "rewards/simpleverify_reward/mean": 0.734375, "rewards/simpleverify_reward/std": 0.44191211462020874, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3818.0, "completions/mean_length": 659.4933471679688, "completions/mean_terminated_length": 636.3258666992188, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 13.307871720116617, "grad_norm": 0.167893186211586, "learning_rate": 1e-06, "loss": 0.0192, "num_tokens": 935124374.0, "reward": 0.6774553656578064, "reward_std": 0.12185201048851013, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 653.8214721679688, "completions/mean_terminated_length": 622.8108520507812, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 13.317201166180759, "grad_norm": 0.1828840672969818, "learning_rate": 1e-06, "loss": 0.041, "num_tokens": 935790422.0, "reward": 0.6752232313156128, "reward_std": 0.12971526384353638, "rewards/simpleverify_reward/mean": 0.6752232313156128, "rewards/simpleverify_reward/std": 0.46855294704437256, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3159.0, "completions/mean_length": 631.2734375, "completions/mean_terminated_length": 615.736572265625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 13.326530612244898, "grad_norm": 0.16376712918281555, "learning_rate": 1e-06, "loss": 0.0142, "num_tokens": 936440619.0, "reward": 0.7299107313156128, "reward_std": 0.12422848492860794, "rewards/simpleverify_reward/mean": 0.7299107313156128, "rewards/simpleverify_reward/std": 0.44425368309020996, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 678.2355346679688, "completions/mean_terminated_length": 655.1943969726562, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 13.335860058309038, "grad_norm": 0.1518632471561432, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 937135182.0, "reward": 0.6897321939468384, "reward_std": 0.12335903942584991, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4047.0, "completions/mean_length": 651.625, "completions/mean_terminated_length": 624.5039672851562, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 13.345189504373177, "grad_norm": 0.1767856478691101, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 937807318.0, "reward": 0.7120535969734192, "reward_std": 0.1362890601158142, "rewards/simpleverify_reward/mean": 0.7120535969734192, "rewards/simpleverify_reward/std": 0.4530589282512665, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 600.640625, "completions/mean_terminated_length": 561.1896362304688, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 13.354518950437317, "grad_norm": 0.14427430927753448, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 938427428.0, "reward": 0.7845982313156128, "reward_std": 0.09183567017316818, "rewards/simpleverify_reward/mean": 0.7845982313156128, "rewards/simpleverify_reward/std": 0.4113304018974304, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 627.6975708007812, "completions/mean_terminated_length": 616.0459594726562, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 13.363848396501458, "grad_norm": 0.1457006335258484, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 939069909.0, "reward": 0.7120535969734192, "reward_std": 0.10851971060037613, "rewards/simpleverify_reward/mean": 0.7120535969734192, "rewards/simpleverify_reward/std": 0.4530589282512665, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 554.5535888671875, "completions/mean_terminated_length": 542.65625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 13.373177842565598, "grad_norm": 0.16433773934841156, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 939661885.0, "reward": 0.777901828289032, "reward_std": 0.09597131609916687, "rewards/simpleverify_reward/mean": 0.7779017686843872, "rewards/simpleverify_reward/std": 0.4158889949321747, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 658.2042846679688, "completions/mean_terminated_length": 638.9124755859375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 13.382507288629737, "grad_norm": 0.1506648063659668, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 940343644.0, "reward": 0.699776828289032, "reward_std": 0.11039765179157257, "rewards/simpleverify_reward/mean": 0.6997767686843872, "rewards/simpleverify_reward/std": 0.4586109220981598, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 605.864990234375, "completions/mean_terminated_length": 598.0570678710938, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 13.391836734693877, "grad_norm": 0.16747622191905975, "learning_rate": 1e-06, "loss": 0.0437, "num_tokens": 940980971.0, "reward": 0.7020089626312256, "reward_std": 0.1317012757062912, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763099193573, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 562.2667846679688, "completions/mean_terminated_length": 542.4365844726562, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 13.401166180758018, "grad_norm": 0.16857285797595978, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 941577138.0, "reward": 0.7287946939468384, "reward_std": 0.1288815587759018, "rewards/simpleverify_reward/mean": 0.7287946343421936, "rewards/simpleverify_reward/std": 0.44483017921447754, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3991.0, "completions/mean_length": 608.84375, "completions/mean_terminated_length": 601.04248046875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 13.410495626822158, "grad_norm": 0.15665094554424286, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 942201870.0, "reward": 0.7979910969734192, "reward_std": 0.10923398286104202, "rewards/simpleverify_reward/mean": 0.7979910969734192, "rewards/simpleverify_reward/std": 0.40172311663627625, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3856.0, "completions/mean_length": 639.7277221679688, "completions/mean_terminated_length": 635.8659057617188, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 13.419825072886297, "grad_norm": 0.14972831308841705, "learning_rate": 1e-06, "loss": 0.0249, "num_tokens": 942873754.0, "reward": 0.7142857313156128, "reward_std": 0.10772695392370224, "rewards/simpleverify_reward/mean": 0.7142857313156128, "rewards/simpleverify_reward/std": 0.4520062506198883, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 4087.0, "completions/mean_length": 647.7444458007812, "completions/mean_terminated_length": 632.2814331054688, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 13.429154518950437, "grad_norm": 0.30139660835266113, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 943540949.0, "reward": 0.7008928656578064, "reward_std": 0.1576283872127533, "rewards/simpleverify_reward/mean": 0.7008928656578064, "rewards/simpleverify_reward/std": 0.4581226110458374, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3757.0, "completions/mean_length": 608.638427734375, "completions/mean_terminated_length": 585.1281127929688, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 13.438483965014576, "grad_norm": 0.19851642847061157, "learning_rate": 1e-06, "loss": 0.0401, "num_tokens": 944177857.0, "reward": 0.7265625596046448, "reward_std": 0.12076614797115326, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3739.0, "completions/mean_length": 687.3035888671875, "completions/mean_terminated_length": 660.4634399414062, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 13.447813411078718, "grad_norm": 0.15827538073062897, "learning_rate": 1e-06, "loss": 0.0341, "num_tokens": 944889217.0, "reward": 0.6584821939468384, "reward_std": 0.13947826623916626, "rewards/simpleverify_reward/mean": 0.6584821343421936, "rewards/simpleverify_reward/std": 0.4744836091995239, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3589.0, "completions/mean_length": 682.6171875, "completions/mean_terminated_length": 651.8660278320312, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 13.457142857142857, "grad_norm": 0.14599883556365967, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 945584194.0, "reward": 0.7555803656578064, "reward_std": 0.12035568058490753, "rewards/simpleverify_reward/mean": 0.7555803656578064, "rewards/simpleverify_reward/std": 0.42998257279396057, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3540.0, "completions/mean_length": 694.1027221679688, "completions/mean_terminated_length": 663.4549560546875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 13.466472303206997, "grad_norm": 0.15025648474693298, "learning_rate": 1e-06, "loss": 0.0213, "num_tokens": 946297814.0, "reward": 0.6830357313156128, "reward_std": 0.10739177465438843, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3169.0, "completions/mean_length": 672.2410888671875, "completions/mean_terminated_length": 645.2823486328125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 13.475801749271136, "grad_norm": 0.12008307129144669, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 946986878.0, "reward": 0.7209821939468384, "reward_std": 0.08431828767061234, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 4019.0, "completions/mean_length": 630.4375, "completions/mean_terminated_length": 618.7951049804688, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 13.485131195335278, "grad_norm": 0.18307197093963623, "learning_rate": 1e-06, "loss": 0.0192, "num_tokens": 947651566.0, "reward": 0.738839328289032, "reward_std": 0.13380561769008636, "rewards/simpleverify_reward/mean": 0.7388392686843872, "rewards/simpleverify_reward/std": 0.439512699842453, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 623.9765625, "completions/mean_terminated_length": 612.3124389648438, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 13.494460641399417, "grad_norm": 0.17601969838142395, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 948298737.0, "reward": 0.7477678656578064, "reward_std": 0.13383883237838745, "rewards/simpleverify_reward/mean": 0.7477678656578064, "rewards/simpleverify_reward/std": 0.4345363676548004, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 666.1015625, "completions/mean_terminated_length": 631.2998657226562, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 13.503790087463557, "grad_norm": 0.16366824507713318, "learning_rate": 1e-06, "loss": 0.0362, "num_tokens": 948986508.0, "reward": 0.723214328289032, "reward_std": 0.1013825312256813, "rewards/simpleverify_reward/mean": 0.7232142686843872, "rewards/simpleverify_reward/std": 0.44765952229499817, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3711.0, "completions/mean_length": 686.7210083007812, "completions/mean_terminated_length": 659.8762817382812, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 13.513119533527696, "grad_norm": 0.16776034235954285, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 949685674.0, "reward": 0.6662946939468384, "reward_std": 0.12497735768556595, "rewards/simpleverify_reward/mean": 0.6662946343421936, "rewards/simpleverify_reward/std": 0.47179922461509705, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3485.0, "completions/mean_length": 643.6015625, "completions/mean_terminated_length": 620.3269653320312, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 13.522448979591836, "grad_norm": 0.14713482558727264, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 950364285.0, "reward": 0.6785714626312256, "reward_std": 0.1187373623251915, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3869.0, "completions/mean_length": 662.8717041015625, "completions/mean_terminated_length": 647.4765014648438, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 13.531778425655977, "grad_norm": 0.16655661165714264, "learning_rate": 1e-06, "loss": 0.0208, "num_tokens": 951039218.0, "reward": 0.7064732313156128, "reward_std": 0.10171841084957123, "rewards/simpleverify_reward/mean": 0.7064732313156128, "rewards/simpleverify_reward/std": 0.4556320011615753, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 585.1652221679688, "completions/mean_terminated_length": 569.4215698242188, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 13.541107871720117, "grad_norm": 0.15159335732460022, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 951653206.0, "reward": 0.7477678656578064, "reward_std": 0.1194523274898529, "rewards/simpleverify_reward/mean": 0.7477678656578064, "rewards/simpleverify_reward/std": 0.434536337852478, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3267.0, "completions/mean_length": 663.4944458007812, "completions/mean_terminated_length": 616.8993530273438, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 13.550437317784256, "grad_norm": 0.16274656355381012, "learning_rate": 1e-06, "loss": 0.03, "num_tokens": 952348185.0, "reward": 0.7087053656578064, "reward_std": 0.12174323946237564, "rewards/simpleverify_reward/mean": 0.7087053656578064, "rewards/simpleverify_reward/std": 0.45461276173591614, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3910.0, "completions/mean_length": 643.0982666015625, "completions/mean_terminated_length": 627.6143798828125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 13.559766763848396, "grad_norm": 0.1936870962381363, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 953004665.0, "reward": 0.7734375596046448, "reward_std": 0.12302455306053162, "rewards/simpleverify_reward/mean": 0.7734375, "rewards/simpleverify_reward/std": 0.4188409447669983, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2930.0, "completions/mean_length": 643.3861694335938, "completions/mean_terminated_length": 612.2815551757812, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 13.569096209912537, "grad_norm": 0.17433889210224152, "learning_rate": 1e-06, "loss": 0.0269, "num_tokens": 953664371.0, "reward": 0.7209821939468384, "reward_std": 0.12869539856910706, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2667.0, "completions/mean_length": 651.8839721679688, "completions/mean_terminated_length": 601.1777954101562, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 13.578425655976677, "grad_norm": 0.14926782250404358, "learning_rate": 1e-06, "loss": 0.0419, "num_tokens": 954334707.0, "reward": 0.7477678656578064, "reward_std": 0.10408349335193634, "rewards/simpleverify_reward/mean": 0.7477678656578064, "rewards/simpleverify_reward/std": 0.4345363676548004, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2445.0, "completions/mean_length": 631.5167846679688, "completions/mean_terminated_length": 604.2373657226562, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 13.587755102040816, "grad_norm": 0.16701577603816986, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 954994058.0, "reward": 0.7287946939468384, "reward_std": 0.119899220764637, "rewards/simpleverify_reward/mean": 0.7287946343421936, "rewards/simpleverify_reward/std": 0.44483017921447754, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3969.0, "completions/mean_length": 655.5067138671875, "completions/mean_terminated_length": 636.1998291015625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 13.597084548104956, "grad_norm": 0.15908285975456238, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 955677664.0, "reward": 0.7187500596046448, "reward_std": 0.11681917309761047, "rewards/simpleverify_reward/mean": 0.71875, "rewards/simpleverify_reward/std": 0.4498603343963623, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3385.0, "completions/mean_length": 679.4285888671875, "completions/mean_terminated_length": 664.107666015625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 13.606413994169095, "grad_norm": 0.15520451962947845, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 956382808.0, "reward": 0.6808035969734192, "reward_std": 0.13515858352184296, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.46642565727233887, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3890.0, "completions/mean_length": 695.0111694335938, "completions/mean_terminated_length": 672.0831298828125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 13.615743440233237, "grad_norm": 0.17393681406974792, "learning_rate": 1e-06, "loss": 0.0322, "num_tokens": 957092034.0, "reward": 0.6897321939468384, "reward_std": 0.1584521234035492, "rewards/simpleverify_reward/mean": 0.6897321343421936, "rewards/simpleverify_reward/std": 0.4628615975379944, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3696.0, "completions/mean_length": 568.1004638671875, "completions/mean_terminated_length": 556.2485961914062, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 13.625072886297376, "grad_norm": 0.17130640149116516, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 957690148.0, "reward": 0.7611607313156128, "reward_std": 0.11193676292896271, "rewards/simpleverify_reward/mean": 0.7611607313156128, "rewards/simpleverify_reward/std": 0.4266124963760376, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3833.0, "completions/mean_length": 705.8516235351562, "completions/mean_terminated_length": 686.8272094726562, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 13.634402332361516, "grad_norm": 0.14605218172073364, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 958418423.0, "reward": 0.6785714626312256, "reward_std": 0.11363105475902557, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3653.0, "completions/mean_length": 704.5592041015625, "completions/mean_terminated_length": 685.5275268554688, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 13.643731778425655, "grad_norm": 0.16035214066505432, "learning_rate": 1e-06, "loss": 0.0207, "num_tokens": 959144532.0, "reward": 0.7031250596046448, "reward_std": 0.14612944424152374, "rewards/simpleverify_reward/mean": 0.703125, "rewards/simpleverify_reward/std": 0.4571361541748047, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3138.0, "completions/mean_length": 667.2824096679688, "completions/mean_terminated_length": 655.7637329101562, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 13.653061224489797, "grad_norm": 0.18535490334033966, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 959822929.0, "reward": 0.7343750596046448, "reward_std": 0.12591028213500977, "rewards/simpleverify_reward/mean": 0.734375, "rewards/simpleverify_reward/std": 0.44191211462020874, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2164.0, "completions/mean_length": 601.052490234375, "completions/mean_terminated_length": 597.1474609375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 13.662390670553936, "grad_norm": 0.1802152842283249, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 960452800.0, "reward": 0.7243303656578064, "reward_std": 0.13801221549510956, "rewards/simpleverify_reward/mean": 0.7243303656578064, "rewards/simpleverify_reward/std": 0.4471006691455841, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2926.0, "completions/mean_length": 684.6875610351562, "completions/mean_terminated_length": 673.2273559570312, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 13.671720116618076, "grad_norm": 0.15701845288276672, "learning_rate": 1e-06, "loss": 0.0293, "num_tokens": 961153720.0, "reward": 0.6930803656578064, "reward_std": 0.15007779002189636, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147337555885315, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3074.0, "completions/mean_length": 668.8225708007812, "completions/mean_terminated_length": 649.5903930664062, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 13.681049562682215, "grad_norm": 0.1516803652048111, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 961837673.0, "reward": 0.660714328289032, "reward_std": 0.11532030999660492, "rewards/simpleverify_reward/mean": 0.6607142686843872, "rewards/simpleverify_reward/std": 0.4737313687801361, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 628.6796875, "completions/mean_terminated_length": 597.4425659179688, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 13.690379008746355, "grad_norm": 0.1836187094449997, "learning_rate": 1e-06, "loss": 0.0326, "num_tokens": 962497410.0, "reward": 0.7254464626312256, "reward_std": 0.14507704973220825, "rewards/simpleverify_reward/mean": 0.7254464030265808, "rewards/simpleverify_reward/std": 0.4465382993221283, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3500.0, "completions/mean_length": 618.2053833007812, "completions/mean_terminated_length": 594.7595825195312, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 13.699708454810496, "grad_norm": 0.14726492762565613, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 963129690.0, "reward": 0.7678571939468384, "reward_std": 0.11095965653657913, "rewards/simpleverify_reward/mean": 0.7678571343421936, "rewards/simpleverify_reward/std": 0.422435462474823, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3599.0, "completions/mean_length": 659.6105346679688, "completions/mean_terminated_length": 651.9228515625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 13.709037900874636, "grad_norm": 0.1796456128358841, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 963804685.0, "reward": 0.7131696939468384, "reward_std": 0.1466953605413437, "rewards/simpleverify_reward/mean": 0.7131696343421936, "rewards/simpleverify_reward/std": 0.4525342583656311, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2672.0, "completions/mean_length": 656.359375, "completions/mean_terminated_length": 613.6068115234375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 13.718367346938775, "grad_norm": 0.15964068472385406, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 964494303.0, "reward": 0.676339328289032, "reward_std": 0.11670787632465363, "rewards/simpleverify_reward/mean": 0.6763392686843872, "rewards/simpleverify_reward/std": 0.4681335985660553, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2931.0, "completions/mean_length": 727.6785888671875, "completions/mean_terminated_length": 723.9150390625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 13.727696793002915, "grad_norm": 0.1502268761396408, "learning_rate": 1e-06, "loss": 0.0264, "num_tokens": 965236911.0, "reward": 0.6808035969734192, "reward_std": 0.1401190608739853, "rewards/simpleverify_reward/mean": 0.6808035969734192, "rewards/simpleverify_reward/std": 0.4664256274700165, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 638.3783569335938, "completions/mean_terminated_length": 626.7626342773438, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 13.737026239067056, "grad_norm": 0.15499494969844818, "learning_rate": 1e-06, "loss": 0.019, "num_tokens": 965888610.0, "reward": 0.7265625596046448, "reward_std": 0.10754331946372986, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2232.0, "completions/mean_length": 628.9888916015625, "completions/mean_terminated_length": 617.341552734375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 13.746355685131196, "grad_norm": 0.19127896428108215, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 966540520.0, "reward": 0.770089328289032, "reward_std": 0.14429426193237305, "rewards/simpleverify_reward/mean": 0.7700892686843872, "rewards/simpleverify_reward/std": 0.42101022601127625, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3028.0, "completions/mean_length": 674.1908569335938, "completions/mean_terminated_length": 651.1224975585938, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 13.755685131195335, "grad_norm": 0.15879015624523163, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 967243339.0, "reward": 0.6930803656578064, "reward_std": 0.13271933794021606, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147337555885315, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 676.732177734375, "completions/mean_terminated_length": 665.2452392578125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 13.765014577259475, "grad_norm": 0.16933493316173553, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 967933683.0, "reward": 0.6863839626312256, "reward_std": 0.11991060525178909, "rewards/simpleverify_reward/mean": 0.6863839030265808, "rewards/simpleverify_reward/std": 0.4642214775085449, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3778.0, "completions/mean_length": 682.786865234375, "completions/mean_terminated_length": 671.3203125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 13.774344023323614, "grad_norm": 0.13573476672172546, "learning_rate": 1e-06, "loss": 0.0181, "num_tokens": 968643364.0, "reward": 0.7042410969734192, "reward_std": 0.10626200586557388, "rewards/simpleverify_reward/mean": 0.7042410969734192, "rewards/simpleverify_reward/std": 0.45663803815841675, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3722.0, "completions/mean_length": 739.6261596679688, "completions/mean_terminated_length": 713.197998046875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 13.783673469387756, "grad_norm": 0.134423166513443, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 969399813.0, "reward": 0.6651785969734192, "reward_std": 0.11734867841005325, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219157218933105, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3133.0, "completions/mean_length": 594.8203125, "completions/mean_terminated_length": 575.1728515625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 13.793002915451895, "grad_norm": 0.20029303431510925, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 970029300.0, "reward": 0.7633928656578064, "reward_std": 0.12726575136184692, "rewards/simpleverify_reward/mean": 0.7633928656578064, "rewards/simpleverify_reward/std": 0.42523646354675293, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3243.0, "completions/mean_length": 741.122802734375, "completions/mean_terminated_length": 714.7064208984375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 13.802332361516035, "grad_norm": 0.14960920810699463, "learning_rate": 1e-06, "loss": 0.027, "num_tokens": 970788658.0, "reward": 0.6796875596046448, "reward_std": 0.1335018277168274, "rewards/simpleverify_reward/mean": 0.6796875, "rewards/simpleverify_reward/std": 0.4668572247028351, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3748.0, "completions/mean_length": 644.1239013671875, "completions/mean_terminated_length": 624.7531127929688, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 13.811661807580174, "grad_norm": 0.1607269048690796, "learning_rate": 1e-06, "loss": 0.0098, "num_tokens": 971444233.0, "reward": 0.7566964626312256, "reward_std": 0.11032027006149292, "rewards/simpleverify_reward/mean": 0.7566964030265808, "rewards/simpleverify_reward/std": 0.4293164908885956, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 694.3314819335938, "completions/mean_terminated_length": 667.5466918945312, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 13.820991253644316, "grad_norm": 0.1799241304397583, "learning_rate": 1e-06, "loss": 0.0318, "num_tokens": 972153890.0, "reward": 0.6908482313156128, "reward_std": 0.11960498243570328, "rewards/simpleverify_reward/mean": 0.6908482313156128, "rewards/simpleverify_reward/std": 0.46240198612213135, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 646.1975708007812, "completions/mean_terminated_length": 634.6080932617188, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 13.830320699708455, "grad_norm": 0.1635061502456665, "learning_rate": 1e-06, "loss": 0.0216, "num_tokens": 972817291.0, "reward": 0.7801339626312256, "reward_std": 0.10705476254224777, "rewards/simpleverify_reward/mean": 0.7801339030265808, "rewards/simpleverify_reward/std": 0.4143870770931244, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2911.0, "completions/mean_length": 647.458740234375, "completions/mean_terminated_length": 624.2101440429688, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 13.839650145772595, "grad_norm": 0.1664106547832489, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 973480206.0, "reward": 0.731026828289032, "reward_std": 0.13474741578102112, "rewards/simpleverify_reward/mean": 0.7310267686843872, "rewards/simpleverify_reward/std": 0.44367367029190063, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3192.0, "completions/mean_length": 815.5267944335938, "completions/mean_terminated_length": 793.4112548828125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 13.848979591836734, "grad_norm": 0.14940832555294037, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 974310702.0, "reward": 0.6082589626312256, "reward_std": 0.12009605765342712, "rewards/simpleverify_reward/mean": 0.6082589030265808, "rewards/simpleverify_reward/std": 0.4884119927883148, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 681.4285888671875, "completions/mean_terminated_length": 650.6666870117188, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 13.858309037900874, "grad_norm": 0.14222438633441925, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 975014358.0, "reward": 0.691964328289032, "reward_std": 0.13109466433525085, "rewards/simpleverify_reward/mean": 0.6919642686843872, "rewards/simpleverify_reward/std": 0.4619392454624176, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3408.0, "completions/mean_length": 641.3683471679688, "completions/mean_terminated_length": 621.9820556640625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 13.867638483965015, "grad_norm": 0.1721368432044983, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 975678592.0, "reward": 0.738839328289032, "reward_std": 0.12632961571216583, "rewards/simpleverify_reward/mean": 0.7388392686843872, "rewards/simpleverify_reward/std": 0.439512699842453, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3103.0, "completions/mean_length": 634.8292846679688, "completions/mean_terminated_length": 619.308349609375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 13.876967930029155, "grad_norm": 0.16731929779052734, "learning_rate": 1e-06, "loss": 0.0315, "num_tokens": 976328799.0, "reward": 0.7087053656578064, "reward_std": 0.12242470681667328, "rewards/simpleverify_reward/mean": 0.7087053656578064, "rewards/simpleverify_reward/std": 0.45461276173591614, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2972.0, "completions/mean_length": 732.8035888671875, "completions/mean_terminated_length": 702.5045166015625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 13.886297376093294, "grad_norm": 0.1430034339427948, "learning_rate": 1e-06, "loss": 0.0337, "num_tokens": 977072303.0, "reward": 0.6551339626312256, "reward_std": 0.11786973476409912, "rewards/simpleverify_reward/mean": 0.6551339030265808, "rewards/simpleverify_reward/std": 0.4755900800228119, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3375.0, "completions/mean_length": 681.638427734375, "completions/mean_terminated_length": 639.2000122070312, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 13.895626822157434, "grad_norm": 0.16463656723499298, "learning_rate": 1e-06, "loss": 0.0199, "num_tokens": 977772419.0, "reward": 0.6785714626312256, "reward_std": 0.13718664646148682, "rewards/simpleverify_reward/mean": 0.6785714030265808, "rewards/simpleverify_reward/std": 0.46728572249412537, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3646.0, "completions/mean_length": 618.9241333007812, "completions/mean_terminated_length": 607.2430419921875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 13.904956268221575, "grad_norm": 0.12439268827438354, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 978404711.0, "reward": 0.7343750596046448, "reward_std": 0.0805649384856224, "rewards/simpleverify_reward/mean": 0.734375, "rewards/simpleverify_reward/std": 0.44191211462020874, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2204.0, "completions/mean_length": 640.325927734375, "completions/mean_terminated_length": 617.0292358398438, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 13.914285714285715, "grad_norm": 0.16905450820922852, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 979073355.0, "reward": 0.7031250596046448, "reward_std": 0.13583369553089142, "rewards/simpleverify_reward/mean": 0.703125, "rewards/simpleverify_reward/std": 0.4571361541748047, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2776.0, "completions/mean_length": 631.1629638671875, "completions/mean_terminated_length": 607.8045043945312, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 13.923615160349854, "grad_norm": 0.17306599020957947, "learning_rate": 1e-06, "loss": 0.0234, "num_tokens": 979727277.0, "reward": 0.7444196939468384, "reward_std": 0.10464803129434586, "rewards/simpleverify_reward/mean": 0.7444196343421936, "rewards/simpleverify_reward/std": 0.43643057346343994, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3198.0, "completions/mean_length": 678.2522583007812, "completions/mean_terminated_length": 670.6062622070312, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 13.932944606413994, "grad_norm": 0.17379014194011688, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 980422239.0, "reward": 0.7053571939468384, "reward_std": 0.14214715361595154, "rewards/simpleverify_reward/mean": 0.7053571343421936, "rewards/simpleverify_reward/std": 0.45613664388656616, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3485.0, "completions/mean_length": 657.7545166015625, "completions/mean_terminated_length": 646.203857421875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 13.942274052478133, "grad_norm": 0.15818026661872864, "learning_rate": 1e-06, "loss": 0.0315, "num_tokens": 981100755.0, "reward": 0.6930803656578064, "reward_std": 0.12546591460704803, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147337555885315, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2989.0, "completions/mean_length": 632.3058471679688, "completions/mean_terminated_length": 605.0326538085938, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 13.951603498542275, "grad_norm": 0.15902744233608246, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 981754605.0, "reward": 0.7332589626312256, "reward_std": 0.11145003139972687, "rewards/simpleverify_reward/mean": 0.7332589030265808, "rewards/simpleverify_reward/std": 0.4425029158592224, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3107.0, "completions/mean_length": 726.7969360351562, "completions/mean_terminated_length": 696.4437255859375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 13.960932944606414, "grad_norm": 0.14821434020996094, "learning_rate": 1e-06, "loss": 0.033, "num_tokens": 982502079.0, "reward": 0.7098214626312256, "reward_std": 0.12354449182748795, "rewards/simpleverify_reward/mean": 0.7098214030265808, "rewards/simpleverify_reward/std": 0.454098105430603, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 757.1719360351562, "completions/mean_terminated_length": 730.8818969726562, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 13.970262390670554, "grad_norm": 0.15266871452331543, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 983261593.0, "reward": 0.6473214626312256, "reward_std": 0.11599431931972504, "rewards/simpleverify_reward/mean": 0.6473214030265808, "rewards/simpleverify_reward/std": 0.47807058691978455, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3960.0, "completions/mean_length": 618.1339721679688, "completions/mean_terminated_length": 598.6173095703125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 13.979591836734693, "grad_norm": 0.14521849155426025, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 983900641.0, "reward": 0.7343750596046448, "reward_std": 0.10370371490716934, "rewards/simpleverify_reward/mean": 0.734375, "rewards/simpleverify_reward/std": 0.44191211462020874, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3058.0, "completions/mean_length": 745.5223388671875, "completions/mean_terminated_length": 741.7787475585938, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 13.988921282798835, "grad_norm": 0.14419567584991455, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 984661677.0, "reward": 0.6473214626312256, "reward_std": 0.13305604457855225, "rewards/simpleverify_reward/mean": 0.6473214030265808, "rewards/simpleverify_reward/std": 0.47807061672210693, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011363636363636354, "completions/max_length": 4096.0, "completions/max_terminated_length": 3098.0, "completions/mean_length": 742.3948974609375, "completions/mean_terminated_length": 703.8477172851562, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 13.998250728862974, "grad_norm": 0.1548193395137787, "learning_rate": 1e-06, "loss": 0.0243, "num_tokens": 985388849.0, "reward": 0.7343750596046448, "reward_std": 0.11190466582775116, "rewards/simpleverify_reward/mean": 0.734375, "rewards/simpleverify_reward/std": 0.44191211462020874, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4022.0, "completions/mean_length": 703.974365234375, "completions/mean_terminated_length": 681.1067504882812, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 14.00932944606414, "grad_norm": 0.14920587837696075, "learning_rate": 1e-06, "loss": 0.0149, "num_tokens": 986107522.0, "reward": 0.6741071939468384, "reward_std": 0.10521145164966583, "rewards/simpleverify_reward/mean": 0.6741071343421936, "rewards/simpleverify_reward/std": 0.4689692556858063, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 631.9710083007812, "completions/mean_terminated_length": 616.437255859375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 14.018658892128279, "grad_norm": 0.14063246548175812, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 986769456.0, "reward": 0.7745535969734192, "reward_std": 0.11140473932027817, "rewards/simpleverify_reward/mean": 0.7745535969734192, "rewards/simpleverify_reward/std": 0.41810935735702515, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3538.0, "completions/mean_length": 734.6172485351562, "completions/mean_terminated_length": 715.7542114257812, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 14.02798833819242, "grad_norm": 0.141231507062912, "learning_rate": 1e-06, "loss": 0.0371, "num_tokens": 987526201.0, "reward": 0.6651785969734192, "reward_std": 0.10739177465438843, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219157218933105, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3955.0, "completions/mean_length": 702.2288208007812, "completions/mean_terminated_length": 671.654296875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 14.03731778425656, "grad_norm": 0.15254561603069305, "learning_rate": 1e-06, "loss": 0.0283, "num_tokens": 988236526.0, "reward": 0.7354910969734192, "reward_std": 0.13733959197998047, "rewards/simpleverify_reward/mean": 0.7354910969734192, "rewards/simpleverify_reward/std": 0.44131770730018616, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3659.0, "completions/mean_length": 636.357177734375, "completions/mean_terminated_length": 613.03369140625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 14.0466472303207, "grad_norm": 0.15811985731124878, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 988896782.0, "reward": 0.7488839626312256, "reward_std": 0.1231333315372467, "rewards/simpleverify_reward/mean": 0.7488839030265808, "rewards/simpleverify_reward/std": 0.43389734625816345, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3376.0, "completions/mean_length": 666.7723388671875, "completions/mean_terminated_length": 647.5286254882812, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 14.055976676384839, "grad_norm": 0.1538214534521103, "learning_rate": 1e-06, "loss": 0.0205, "num_tokens": 989579370.0, "reward": 0.7031250596046448, "reward_std": 0.11227306723594666, "rewards/simpleverify_reward/mean": 0.703125, "rewards/simpleverify_reward/std": 0.4571361541748047, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3193.0, "completions/mean_length": 660.4464721679688, "completions/mean_terminated_length": 641.167236328125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 14.06530612244898, "grad_norm": 0.16066239774227142, "learning_rate": 1e-06, "loss": 0.0175, "num_tokens": 990258570.0, "reward": 0.7455357313156128, "reward_std": 0.1164814680814743, "rewards/simpleverify_reward/mean": 0.7455357313156128, "rewards/simpleverify_reward/std": 0.4358029067516327, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3320.0, "completions/mean_length": 664.2377319335938, "completions/mean_terminated_length": 652.7088623046875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 14.07463556851312, "grad_norm": 0.15418395400047302, "learning_rate": 1e-06, "loss": 0.0267, "num_tokens": 990948495.0, "reward": 0.7064732313156128, "reward_std": 0.10423504561185837, "rewards/simpleverify_reward/mean": 0.7064732313156128, "rewards/simpleverify_reward/std": 0.4556320011615753, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4011.0, "completions/mean_length": 722.1864013671875, "completions/mean_terminated_length": 695.6209716796875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 14.08396501457726, "grad_norm": 0.1535099893808365, "learning_rate": 1e-06, "loss": 0.0202, "num_tokens": 991683750.0, "reward": 0.738839328289032, "reward_std": 0.14533667266368866, "rewards/simpleverify_reward/mean": 0.7388392686843872, "rewards/simpleverify_reward/std": 0.439512699842453, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3786.0, "completions/mean_length": 645.9676513671875, "completions/mean_terminated_length": 610.961669921875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 14.093294460641399, "grad_norm": 0.14591118693351746, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 992350441.0, "reward": 0.7410714626312256, "reward_std": 0.10746844857931137, "rewards/simpleverify_reward/mean": 0.7410714030265808, "rewards/simpleverify_reward/std": 0.43829095363616943, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2971.0, "completions/mean_length": 663.5256958007812, "completions/mean_terminated_length": 644.2637939453125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 14.102623906705539, "grad_norm": 0.15462873876094818, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 993036392.0, "reward": 0.7522321939468384, "reward_std": 0.10299650579690933, "rewards/simpleverify_reward/mean": 0.7522321343421936, "rewards/simpleverify_reward/std": 0.4319573938846588, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2840.0, "completions/mean_length": 644.1105346679688, "completions/mean_terminated_length": 632.5140380859375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 14.11195335276968, "grad_norm": 0.17140205204486847, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 993695747.0, "reward": 0.7790178656578064, "reward_std": 0.0990067571401596, "rewards/simpleverify_reward/mean": 0.7790178656578064, "rewards/simpleverify_reward/std": 0.41514021158218384, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3674.0, "completions/mean_length": 710.7578735351562, "completions/mean_terminated_length": 691.760986328125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 14.12128279883382, "grad_norm": 0.14265882968902588, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 994416138.0, "reward": 0.6941964626312256, "reward_std": 0.12332695722579956, "rewards/simpleverify_reward/mean": 0.6941964030265808, "rewards/simpleverify_reward/std": 0.4610042870044708, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3639.0, "completions/mean_length": 603.4631958007812, "completions/mean_terminated_length": 583.8641967773438, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 14.130612244897959, "grad_norm": 0.17454662919044495, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 995037161.0, "reward": 0.7812500596046448, "reward_std": 0.10626522451639175, "rewards/simpleverify_reward/mean": 0.78125, "rewards/simpleverify_reward/std": 0.41362953186035156, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3800.0, "completions/mean_length": 637.1506958007812, "completions/mean_terminated_length": 629.4127807617188, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 14.139941690962099, "grad_norm": 0.1671658158302307, "learning_rate": 1e-06, "loss": 0.0055, "num_tokens": 995701896.0, "reward": 0.6819196939468384, "reward_std": 0.12128929793834686, "rewards/simpleverify_reward/mean": 0.6819196343421936, "rewards/simpleverify_reward/std": 0.46599099040031433, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3735.0, "completions/mean_length": 762.5045166015625, "completions/mean_terminated_length": 721.0712280273438, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 14.14927113702624, "grad_norm": 0.14777782559394836, "learning_rate": 1e-06, "loss": 0.0348, "num_tokens": 996479268.0, "reward": 0.6953125596046448, "reward_std": 0.12024439126253128, "rewards/simpleverify_reward/mean": 0.6953125, "rewards/simpleverify_reward/std": 0.4605320394039154, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2711.0, "completions/mean_length": 734.818115234375, "completions/mean_terminated_length": 708.3521118164062, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 14.15860058309038, "grad_norm": 0.17617565393447876, "learning_rate": 1e-06, "loss": 0.0211, "num_tokens": 997228105.0, "reward": 0.699776828289032, "reward_std": 0.16235703229904175, "rewards/simpleverify_reward/mean": 0.6997767686843872, "rewards/simpleverify_reward/std": 0.4586109220981598, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3765.0, "completions/mean_length": 653.6038208007812, "completions/mean_terminated_length": 638.1670532226562, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 14.167930029154519, "grad_norm": 0.1751776933670044, "learning_rate": 1e-06, "loss": 0.0242, "num_tokens": 997902998.0, "reward": 0.7555803656578064, "reward_std": 0.10964696109294891, "rewards/simpleverify_reward/mean": 0.7555803656578064, "rewards/simpleverify_reward/std": 0.42998260259628296, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 639.7377319335938, "completions/mean_terminated_length": 632.005615234375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 14.177259475218658, "grad_norm": 0.1895642727613449, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 998564395.0, "reward": 0.7511160969734192, "reward_std": 0.1507938802242279, "rewards/simpleverify_reward/mean": 0.7511160969734192, "rewards/simpleverify_reward/std": 0.43260788917541504, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2832.0, "completions/mean_length": 731.521240234375, "completions/mean_terminated_length": 701.2106323242188, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 14.186588921282798, "grad_norm": 0.1477023810148239, "learning_rate": 1e-06, "loss": 0.0262, "num_tokens": 999305278.0, "reward": 0.6975446939468384, "reward_std": 0.12793654203414917, "rewards/simpleverify_reward/mean": 0.6975446343421936, "rewards/simpleverify_reward/std": 0.45957791805267334, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3703.0, "completions/mean_length": 708.9810791015625, "completions/mean_terminated_length": 663.00341796875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 14.19591836734694, "grad_norm": 0.12982113659381866, "learning_rate": 1e-06, "loss": 0.027, "num_tokens": 1000033517.0, "reward": 0.7176339626312256, "reward_std": 0.10028737038373947, "rewards/simpleverify_reward/mean": 0.7176339030265808, "rewards/simpleverify_reward/std": 0.4504019320011139, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2720.0, "completions/mean_length": 619.5223388671875, "completions/mean_terminated_length": 592.1484985351562, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 14.205247813411079, "grad_norm": 0.17033196985721588, "learning_rate": 1e-06, "loss": 0.0278, "num_tokens": 1000677921.0, "reward": 0.7667410969734192, "reward_std": 0.13038787245750427, "rewards/simpleverify_reward/mean": 0.7667410969734192, "rewards/simpleverify_reward/std": 0.4231418967247009, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017857142857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3291.0, "completions/mean_length": 701.1920166015625, "completions/mean_terminated_length": 639.4681396484375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 14.214577259475218, "grad_norm": 0.17213483154773712, "learning_rate": 1e-06, "loss": 0.0408, "num_tokens": 1001391525.0, "reward": 0.7276785969734192, "reward_std": 0.13996821641921997, "rewards/simpleverify_reward/mean": 0.7276785969734192, "rewards/simpleverify_reward/std": 0.4454030692577362, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3482.0, "completions/mean_length": 708.575927734375, "completions/mean_terminated_length": 689.5668334960938, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 14.223906705539358, "grad_norm": 0.16702331602573395, "learning_rate": 1e-06, "loss": 0.0395, "num_tokens": 1002107945.0, "reward": 0.7332589626312256, "reward_std": 0.1378638744354248, "rewards/simpleverify_reward/mean": 0.7332589030265808, "rewards/simpleverify_reward/std": 0.4425029158592224, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 724.6116333007812, "completions/mean_terminated_length": 698.0652465820312, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 14.2332361516035, "grad_norm": 0.15681934356689453, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 1002845493.0, "reward": 0.6930803656578064, "reward_std": 0.13470645248889923, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147334575653076, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 664.1796875, "completions/mean_terminated_length": 629.3584594726562, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 14.242565597667639, "grad_norm": 0.17765192687511444, "learning_rate": 1e-06, "loss": 0.0233, "num_tokens": 1003529054.0, "reward": 0.7477678656578064, "reward_std": 0.11190466582775116, "rewards/simpleverify_reward/mean": 0.7477678656578064, "rewards/simpleverify_reward/std": 0.434536337852478, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3269.0, "completions/mean_length": 665.953125, "completions/mean_terminated_length": 638.9448852539062, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 14.251895043731778, "grad_norm": 0.14711976051330566, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 1004226628.0, "reward": 0.7332589626312256, "reward_std": 0.10408420115709305, "rewards/simpleverify_reward/mean": 0.7332589030265808, "rewards/simpleverify_reward/std": 0.4425029158592224, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3269.0, "completions/mean_length": 648.2176513671875, "completions/mean_terminated_length": 628.8698120117188, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 14.261224489795918, "grad_norm": 0.17210184037685394, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 1004897671.0, "reward": 0.6986607313156128, "reward_std": 0.1363222450017929, "rewards/simpleverify_reward/mean": 0.6986607313156128, "rewards/simpleverify_reward/std": 0.4590960144996643, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3136.0, "completions/mean_length": 685.1763916015625, "completions/mean_terminated_length": 677.5458374023438, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 14.270553935860057, "grad_norm": 0.17669369280338287, "learning_rate": 1e-06, "loss": 0.0343, "num_tokens": 1005609405.0, "reward": 0.7031250596046448, "reward_std": 0.1537230759859085, "rewards/simpleverify_reward/mean": 0.703125, "rewards/simpleverify_reward/std": 0.4571361541748047, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016741071428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3749.0, "completions/mean_length": 813.8047485351562, "completions/mean_terminated_length": 757.9216918945312, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 14.279883381924199, "grad_norm": 0.14803270995616913, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 1006431286.0, "reward": 0.609375, "reward_std": 0.1471807062625885, "rewards/simpleverify_reward/mean": 0.609375, "rewards/simpleverify_reward/std": 0.48816296458244324, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3135.0, "completions/mean_length": 753.2924194335938, "completions/mean_terminated_length": 734.5342407226562, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 14.289212827988338, "grad_norm": 0.20494695007801056, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 1007186956.0, "reward": 0.6964285969734192, "reward_std": 0.14368836581707, "rewards/simpleverify_reward/mean": 0.6964285969734192, "rewards/simpleverify_reward/std": 0.4600565731525421, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2873.0, "completions/mean_length": 601.0658569335938, "completions/mean_terminated_length": 581.4534301757812, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 14.298542274052478, "grad_norm": 0.1623178869485855, "learning_rate": 1e-06, "loss": 0.0217, "num_tokens": 1007798079.0, "reward": 0.8046875596046448, "reward_std": 0.1072402223944664, "rewards/simpleverify_reward/mean": 0.8046875, "rewards/simpleverify_reward/std": 0.39666250348091125, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3829.0, "completions/mean_length": 691.3080444335938, "completions/mean_terminated_length": 648.9898681640625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 14.307871720116617, "grad_norm": 0.16939790546894073, "learning_rate": 1e-06, "loss": 0.0244, "num_tokens": 1008504995.0, "reward": 0.7254464626312256, "reward_std": 0.11355297267436981, "rewards/simpleverify_reward/mean": 0.7254464030265808, "rewards/simpleverify_reward/std": 0.4465382993221283, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3797.0, "completions/mean_length": 656.1171875, "completions/mean_terminated_length": 636.813720703125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 14.317201166180759, "grad_norm": 0.15664558112621307, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 1009186220.0, "reward": 0.7008928656578064, "reward_std": 0.12933479249477386, "rewards/simpleverify_reward/mean": 0.7008928656578064, "rewards/simpleverify_reward/std": 0.458122581243515, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3328.0, "completions/mean_length": 614.6763916015625, "completions/mean_terminated_length": 602.98095703125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 14.326530612244898, "grad_norm": 0.14163699746131897, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 1009817034.0, "reward": 0.7265625596046448, "reward_std": 0.10059047490358353, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3830.0, "completions/mean_length": 721.1339721679688, "completions/mean_terminated_length": 706.0000610351562, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 14.335860058309038, "grad_norm": 0.16106384992599487, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 1010551514.0, "reward": 0.7265625596046448, "reward_std": 0.1238136887550354, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.011160714285714302, "completions/max_length": 4096.0, "completions/max_terminated_length": 3156.0, "completions/mean_length": 707.4620971679688, "completions/mean_terminated_length": 669.2167358398438, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 14.345189504373177, "grad_norm": 0.178434818983078, "learning_rate": 1e-06, "loss": 0.0431, "num_tokens": 1011276520.0, "reward": 0.6830357313156128, "reward_std": 0.14304755628108978, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2670.0, "completions/mean_length": 651.0379638671875, "completions/mean_terminated_length": 643.3311157226562, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 14.354518950437317, "grad_norm": 0.17592141032218933, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 1011942786.0, "reward": 0.7633928656578064, "reward_std": 0.12798184156417847, "rewards/simpleverify_reward/mean": 0.7633928656578064, "rewards/simpleverify_reward/std": 0.42523646354675293, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3546.0, "completions/mean_length": 706.9475708007812, "completions/mean_terminated_length": 699.3657836914062, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 14.363848396501458, "grad_norm": 0.13209927082061768, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 1012675443.0, "reward": 0.6383928656578064, "reward_std": 0.1040068119764328, "rewards/simpleverify_reward/mean": 0.6383928656578064, "rewards/simpleverify_reward/std": 0.4807341694831848, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3790.0, "completions/mean_length": 687.232177734375, "completions/mean_terminated_length": 660.3914794921875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 14.373177842565598, "grad_norm": 0.16038745641708374, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 1013371651.0, "reward": 0.7131696939468384, "reward_std": 0.1134783998131752, "rewards/simpleverify_reward/mean": 0.7131696343421936, "rewards/simpleverify_reward/std": 0.4525342881679535, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3298.0, "completions/mean_length": 655.3828125, "completions/mean_terminated_length": 647.6856689453125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 14.382507288629737, "grad_norm": 0.13692831993103027, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 1014043450.0, "reward": 0.7142857313156128, "reward_std": 0.09664209932088852, "rewards/simpleverify_reward/mean": 0.7142857313156128, "rewards/simpleverify_reward/std": 0.4520062506198883, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 609.0279541015625, "completions/mean_terminated_length": 577.61376953125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 14.391836734693877, "grad_norm": 0.16954950988292694, "learning_rate": 1e-06, "loss": 0.0465, "num_tokens": 1014671739.0, "reward": 0.7600446939468384, "reward_std": 0.12276241183280945, "rewards/simpleverify_reward/mean": 0.7600446343421936, "rewards/simpleverify_reward/std": 0.42729446291923523, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3840.0, "completions/mean_length": 730.325927734375, "completions/mean_terminated_length": 711.4388427734375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 14.401166180758018, "grad_norm": 0.17702612280845642, "learning_rate": 1e-06, "loss": 0.0266, "num_tokens": 1015421063.0, "reward": 0.7098214626312256, "reward_std": 0.12538963556289673, "rewards/simpleverify_reward/mean": 0.7098214030265808, "rewards/simpleverify_reward/std": 0.454098105430603, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3391.0, "completions/mean_length": 693.7611694335938, "completions/mean_terminated_length": 659.2401123046875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 14.410495626822158, "grad_norm": 0.16673922538757324, "learning_rate": 1e-06, "loss": 0.0344, "num_tokens": 1016123353.0, "reward": 0.7611607313156128, "reward_std": 0.14594396948814392, "rewards/simpleverify_reward/mean": 0.7611607313156128, "rewards/simpleverify_reward/std": 0.4266124963760376, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2908.0, "completions/mean_length": 649.7991333007812, "completions/mean_terminated_length": 645.9486083984375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 14.419825072886297, "grad_norm": 0.15671484172344208, "learning_rate": 1e-06, "loss": 0.0157, "num_tokens": 1016787237.0, "reward": 0.7645089626312256, "reward_std": 0.1055484339594841, "rewards/simpleverify_reward/mean": 0.7645089030265808, "rewards/simpleverify_reward/std": 0.42454230785369873, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 669.622802734375, "completions/mean_terminated_length": 650.3950805664062, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 14.429154518950437, "grad_norm": 0.15034587681293488, "learning_rate": 1e-06, "loss": 0.0185, "num_tokens": 1017489347.0, "reward": 0.7299107313156128, "reward_std": 0.12256740033626556, "rewards/simpleverify_reward/mean": 0.7299107313156128, "rewards/simpleverify_reward/std": 0.44425368309020996, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 4057.0, "completions/mean_length": 699.2042846679688, "completions/mean_terminated_length": 664.7384033203125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 14.438483965014576, "grad_norm": 0.14881126582622528, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 1018204898.0, "reward": 0.7165178656578064, "reward_std": 0.09938512742519379, "rewards/simpleverify_reward/mean": 0.7165178656578064, "rewards/simpleverify_reward/std": 0.4509401023387909, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3269.0, "completions/mean_length": 708.271240234375, "completions/mean_terminated_length": 677.7511596679688, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 14.447813411078718, "grad_norm": 0.1549883931875229, "learning_rate": 1e-06, "loss": 0.0244, "num_tokens": 1018922389.0, "reward": 0.723214328289032, "reward_std": 0.12189479172229767, "rewards/simpleverify_reward/mean": 0.7232142686843872, "rewards/simpleverify_reward/std": 0.44765952229499817, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3717.0, "completions/mean_length": 637.9765625, "completions/mean_terminated_length": 594.9954833984375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 14.457142857142857, "grad_norm": 0.14158619940280914, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 1019588088.0, "reward": 0.7020089626312256, "reward_std": 0.08003361523151398, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763099193573, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3668.0, "completions/mean_length": 722.7020263671875, "completions/mean_terminated_length": 692.3119506835938, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 14.466472303206997, "grad_norm": 0.12094037979841232, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 1020338085.0, "reward": 0.6986607313156128, "reward_std": 0.09520993381738663, "rewards/simpleverify_reward/mean": 0.6986607313156128, "rewards/simpleverify_reward/std": 0.4590960144996643, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 673.375, "completions/mean_terminated_length": 642.54052734375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 14.475801749271136, "grad_norm": 0.19650587439537048, "learning_rate": 1e-06, "loss": 0.0455, "num_tokens": 1021022709.0, "reward": 0.7321428656578064, "reward_std": 0.17122988402843475, "rewards/simpleverify_reward/mean": 0.7321428656578064, "rewards/simpleverify_reward/std": 0.4430900514125824, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2263.0, "completions/max_terminated_length": 2263.0, "completions/mean_length": 628.7332763671875, "completions/mean_terminated_length": 628.7332763671875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 14.485131195335278, "grad_norm": 0.14419560134410858, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 1021685134.0, "reward": 0.6975446939468384, "reward_std": 0.11591945588588715, "rewards/simpleverify_reward/mean": 0.6975446343421936, "rewards/simpleverify_reward/std": 0.45957788825035095, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3159.0, "completions/mean_length": 676.2410888671875, "completions/mean_terminated_length": 633.735595703125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 14.494460641399417, "grad_norm": 0.17971345782279968, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 1022376190.0, "reward": 0.723214328289032, "reward_std": 0.1391419768333435, "rewards/simpleverify_reward/mean": 0.7232142686843872, "rewards/simpleverify_reward/std": 0.44765952229499817, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 2839.0, "completions/mean_length": 679.5881958007812, "completions/mean_terminated_length": 644.9232788085938, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 14.503790087463557, "grad_norm": 0.14466272294521332, "learning_rate": 1e-06, "loss": 0.0178, "num_tokens": 1023079117.0, "reward": 0.6886160969734192, "reward_std": 0.10028737038373947, "rewards/simpleverify_reward/mean": 0.6886160969734192, "rewards/simpleverify_reward/std": 0.46331802010536194, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3217.0, "completions/mean_length": 694.943115234375, "completions/mean_terminated_length": 672.0145874023438, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 14.513119533527696, "grad_norm": 0.14066444337368011, "learning_rate": 1e-06, "loss": 0.0138, "num_tokens": 1023790450.0, "reward": 0.7500000596046448, "reward_std": 0.10058976709842682, "rewards/simpleverify_reward/mean": 0.75, "rewards/simpleverify_reward/std": 0.43325456976890564, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3402.0, "completions/mean_length": 677.9520263671875, "completions/mean_terminated_length": 658.7710571289062, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 14.522448979591836, "grad_norm": 0.17435820400714874, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 1024488359.0, "reward": 0.7265625596046448, "reward_std": 0.12621267139911652, "rewards/simpleverify_reward/mean": 0.7265625, "rewards/simpleverify_reward/std": 0.4459724426269531, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2532.0, "completions/mean_length": 679.7935791015625, "completions/mean_terminated_length": 668.3169555664062, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 14.531778425655977, "grad_norm": 0.13622109591960907, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 1025195982.0, "reward": 0.707589328289032, "reward_std": 0.08860477805137634, "rewards/simpleverify_reward/mean": 0.7075892686843872, "rewards/simpleverify_reward/std": 0.45512402057647705, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3504.0, "completions/mean_length": 729.8035888671875, "completions/mean_terminated_length": 718.4949951171875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 14.541107871720117, "grad_norm": 0.1537478119134903, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 1025945230.0, "reward": 0.684151828289032, "reward_std": 0.12125609070062637, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.4651124179363251, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 627.4754638671875, "completions/mean_terminated_length": 623.5999755859375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 14.550437317784256, "grad_norm": 0.1822907030582428, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 1026593896.0, "reward": 0.7667410969734192, "reward_std": 0.13478313386440277, "rewards/simpleverify_reward/mean": 0.7667410969734192, "rewards/simpleverify_reward/std": 0.4231418967247009, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3675.0, "completions/mean_length": 615.904052734375, "completions/mean_terminated_length": 584.5518188476562, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 14.559766763848396, "grad_norm": 0.1439361274242401, "learning_rate": 1e-06, "loss": 0.0233, "num_tokens": 1027225154.0, "reward": 0.7868303656578064, "reward_std": 0.08875633031129837, "rewards/simpleverify_reward/mean": 0.7868303656578064, "rewards/simpleverify_reward/std": 0.4097752273082733, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 627.521240234375, "completions/mean_terminated_length": 596.273681640625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 14.569096209912537, "grad_norm": 0.1524398773908615, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 1027877773.0, "reward": 0.7142857313156128, "reward_std": 0.09517784416675568, "rewards/simpleverify_reward/mean": 0.7142857313156128, "rewards/simpleverify_reward/std": 0.4520062506198883, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 746.2210083007812, "completions/mean_terminated_length": 716.0427856445312, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 14.578425655976677, "grad_norm": 0.194468691945076, "learning_rate": 1e-06, "loss": 0.0254, "num_tokens": 1028634555.0, "reward": 0.6707589626312256, "reward_std": 0.12715516984462738, "rewards/simpleverify_reward/mean": 0.6707589030265808, "rewards/simpleverify_reward/std": 0.4702001214027405, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3883.0, "completions/mean_length": 620.739990234375, "completions/mean_terminated_length": 593.375732421875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 14.587755102040816, "grad_norm": 0.1550123244524002, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 1029278058.0, "reward": 0.7566964626312256, "reward_std": 0.11321555078029633, "rewards/simpleverify_reward/mean": 0.7566964030265808, "rewards/simpleverify_reward/std": 0.42931652069091797, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2722.0, "completions/mean_length": 639.2890625, "completions/mean_terminated_length": 627.6763916015625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 14.597084548104956, "grad_norm": 0.152225062251091, "learning_rate": 1e-06, "loss": 0.023, "num_tokens": 1029944837.0, "reward": 0.7198660969734192, "reward_std": 0.1242612823843956, "rewards/simpleverify_reward/mean": 0.7198660969734192, "rewards/simpleverify_reward/std": 0.44931527972221375, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3170.0, "completions/mean_length": 649.2232666015625, "completions/mean_terminated_length": 625.9865112304688, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 14.606413994169095, "grad_norm": 0.1583629548549652, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 1030611861.0, "reward": 0.7500000596046448, "reward_std": 0.12249071896076202, "rewards/simpleverify_reward/mean": 0.75, "rewards/simpleverify_reward/std": 0.43325456976890564, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 692.1908569335938, "completions/mean_terminated_length": 673.08984375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 14.615743440233237, "grad_norm": 0.16240210831165314, "learning_rate": 1e-06, "loss": 0.0242, "num_tokens": 1031321520.0, "reward": 0.7243303656578064, "reward_std": 0.11039765179157257, "rewards/simpleverify_reward/mean": 0.7243303656578064, "rewards/simpleverify_reward/std": 0.4471006691455841, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 639.671875, "completions/mean_terminated_length": 628.0604858398438, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 14.625072886297376, "grad_norm": 0.18450625240802765, "learning_rate": 1e-06, "loss": 0.022, "num_tokens": 1031983226.0, "reward": 0.7678571939468384, "reward_std": 0.11953012645244598, "rewards/simpleverify_reward/mean": 0.7678571343421936, "rewards/simpleverify_reward/std": 0.422435462474823, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012276785714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3378.0, "completions/mean_length": 676.0145263671875, "completions/mean_terminated_length": 633.5062255859375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 14.634402332361516, "grad_norm": 0.13682182133197784, "learning_rate": 1e-06, "loss": 0.0104, "num_tokens": 1032673783.0, "reward": 0.7433035969734192, "reward_std": 0.10092606395483017, "rewards/simpleverify_reward/mean": 0.7433035969734192, "rewards/simpleverify_reward/std": 0.43705445528030396, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 577.234375, "completions/mean_terminated_length": 561.4552001953125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 14.643731778425655, "grad_norm": 0.19240865111351013, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 1033287057.0, "reward": 0.7533482313156128, "reward_std": 0.1180986762046814, "rewards/simpleverify_reward/mean": 0.7533482313156128, "rewards/simpleverify_reward/std": 0.4313030242919922, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3499.0, "completions/mean_length": 694.3482666015625, "completions/mean_terminated_length": 671.4157104492188, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 14.653061224489797, "grad_norm": 0.18752305209636688, "learning_rate": 1e-06, "loss": 0.0241, "num_tokens": 1033991801.0, "reward": 0.7421875596046448, "reward_std": 0.12756815552711487, "rewards/simpleverify_reward/mean": 0.7421875, "rewards/simpleverify_reward/std": 0.43767455220222473, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014508928571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3851.0, "completions/mean_length": 738.5145263671875, "completions/mean_terminated_length": 689.0838012695312, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 14.662390670553936, "grad_norm": 0.19380046427249908, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 1034754486.0, "reward": 0.684151828289032, "reward_std": 0.13932812213897705, "rewards/simpleverify_reward/mean": 0.6841517686843872, "rewards/simpleverify_reward/std": 0.4651124179363251, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2877.0, "completions/mean_length": 659.8772583007812, "completions/mean_terminated_length": 640.5948486328125, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 14.671720116618076, "grad_norm": 0.15627489984035492, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 1035432816.0, "reward": 0.7109375596046448, "reward_std": 0.11547186225652695, "rewards/simpleverify_reward/mean": 0.7109375, "rewards/simpleverify_reward/std": 0.45358020067214966, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 3586.0, "completions/mean_length": 670.0167846679688, "completions/mean_terminated_length": 639.1520385742188, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 14.681049562682215, "grad_norm": 0.17642037570476532, "learning_rate": 1e-06, "loss": 0.0274, "num_tokens": 1036125439.0, "reward": 0.7299107313156128, "reward_std": 0.11681918054819107, "rewards/simpleverify_reward/mean": 0.7299107313156128, "rewards/simpleverify_reward/std": 0.44425368309020996, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 619.5971069335938, "completions/mean_terminated_length": 611.8198852539062, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 14.690379008746355, "grad_norm": 0.15956062078475952, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 1036778094.0, "reward": 0.7187500596046448, "reward_std": 0.11276161670684814, "rewards/simpleverify_reward/mean": 0.71875, "rewards/simpleverify_reward/std": 0.4498603343963623, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3327.0, "completions/mean_length": 757.1707763671875, "completions/mean_terminated_length": 734.6618041992188, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 14.699708454810496, "grad_norm": 0.1903618425130844, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 1037547759.0, "reward": 0.6127232313156128, "reward_std": 0.1573658436536789, "rewards/simpleverify_reward/mean": 0.6127232313156128, "rewards/simpleverify_reward/std": 0.4873998463153839, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 637.1027221679688, "completions/mean_terminated_length": 609.8673095703125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 14.709037900874636, "grad_norm": 0.17834949493408203, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 1038206339.0, "reward": 0.7522321939468384, "reward_std": 0.13598595559597015, "rewards/simpleverify_reward/mean": 0.7522321343421936, "rewards/simpleverify_reward/std": 0.4319573640823364, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2462.0, "completions/mean_length": 652.4855346679688, "completions/mean_terminated_length": 625.3712158203125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 14.718367346938775, "grad_norm": 0.20041854679584503, "learning_rate": 1e-06, "loss": 0.0257, "num_tokens": 1038872542.0, "reward": 0.7209821939468384, "reward_std": 0.14353612065315247, "rewards/simpleverify_reward/mean": 0.7209821343421936, "rewards/simpleverify_reward/std": 0.448766827583313, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2929.0, "completions/mean_length": 683.1819458007812, "completions/mean_terminated_length": 660.1741943359375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 14.727696793002915, "grad_norm": 0.1708071231842041, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 1039591361.0, "reward": 0.6774553656578064, "reward_std": 0.1162225604057312, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3817.0, "completions/mean_length": 656.2355346679688, "completions/mean_terminated_length": 640.810546875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 14.737026239067056, "grad_norm": 0.14733074605464935, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 1040278676.0, "reward": 0.731026828289032, "reward_std": 0.12659244239330292, "rewards/simpleverify_reward/mean": 0.7310267686843872, "rewards/simpleverify_reward/std": 0.44367367029190063, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0022321428571429047, "completions/max_length": 4096.0, "completions/max_terminated_length": 3394.0, "completions/mean_length": 594.8783569335938, "completions/mean_terminated_length": 587.0458374023438, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 14.746355685131196, "grad_norm": 0.17726483941078186, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 1040892239.0, "reward": 0.7946428656578064, "reward_std": 0.1212565079331398, "rewards/simpleverify_reward/mean": 0.7946428656578064, "rewards/simpleverify_reward/std": 0.40418797731399536, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.008928571428571397, "completions/max_length": 4096.0, "completions/max_terminated_length": 2929.0, "completions/mean_length": 663.671875, "completions/mean_terminated_length": 632.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 14.755685131195335, "grad_norm": 0.15900109708309174, "learning_rate": 1e-06, "loss": 0.04, "num_tokens": 1041581977.0, "reward": 0.754464328289032, "reward_std": 0.12696154415607452, "rewards/simpleverify_reward/mean": 0.7544642686843872, "rewards/simpleverify_reward/std": 0.43064478039741516, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 636.8192138671875, "completions/mean_terminated_length": 625.1982421875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 14.765014577259475, "grad_norm": 0.16544391214847565, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 1042243359.0, "reward": 0.731026828289032, "reward_std": 0.12523697316646576, "rewards/simpleverify_reward/mean": 0.7310267686843872, "rewards/simpleverify_reward/std": 0.44367367029190063, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3867.0, "completions/mean_length": 768.0178833007812, "completions/mean_terminated_length": 734.250244140625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 14.774344023323614, "grad_norm": 0.17557606101036072, "learning_rate": 1e-06, "loss": 0.0094, "num_tokens": 1043033039.0, "reward": 0.6305803656578064, "reward_std": 0.14692038297653198, "rewards/simpleverify_reward/mean": 0.6305803656578064, "rewards/simpleverify_reward/std": 0.4829172194004059, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 678.6986694335938, "completions/mean_terminated_length": 655.6607055664062, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 14.783673469387756, "grad_norm": 0.1506243497133255, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 1043733521.0, "reward": 0.6830357313156128, "reward_std": 0.10077522695064545, "rewards/simpleverify_reward/mean": 0.6830357313156128, "rewards/simpleverify_reward/std": 0.46555325388908386, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3268.0, "completions/mean_length": 635.015625, "completions/mean_terminated_length": 619.4955444335938, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 14.793002915451895, "grad_norm": 0.2674004137516022, "learning_rate": 1e-06, "loss": 0.0181, "num_tokens": 1044387335.0, "reward": 0.7801339626312256, "reward_std": 0.10235892981290817, "rewards/simpleverify_reward/mean": 0.7801339030265808, "rewards/simpleverify_reward/std": 0.4143870770931244, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3925.0, "completions/mean_length": 690.0670166015625, "completions/mean_terminated_length": 667.1056518554688, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 14.802332361516035, "grad_norm": 0.1685916781425476, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 1045091619.0, "reward": 0.6886160969734192, "reward_std": 0.13579979538917542, "rewards/simpleverify_reward/mean": 0.6886160969734192, "rewards/simpleverify_reward/std": 0.46331802010536194, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 724.1250610351562, "completions/mean_terminated_length": 720.3575439453125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 14.811661807580174, "grad_norm": 0.13679945468902588, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 1045836427.0, "reward": 0.7020089626312256, "reward_std": 0.08762837946414948, "rewards/simpleverify_reward/mean": 0.7020089030265808, "rewards/simpleverify_reward/std": 0.45763099193573, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 4027.0, "completions/mean_length": 696.7489013671875, "completions/mean_terminated_length": 673.8325805664062, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 14.820991253644316, "grad_norm": 0.16976775228977203, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 1046553370.0, "reward": 0.7198660969734192, "reward_std": 0.09175898879766464, "rewards/simpleverify_reward/mean": 0.7198660969734192, "rewards/simpleverify_reward/std": 0.44931527972221375, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3449.0, "completions/mean_length": 754.4051513671875, "completions/mean_terminated_length": 728.0933837890625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 14.830320699708455, "grad_norm": 0.16923169791698456, "learning_rate": 1e-06, "loss": 0.0106, "num_tokens": 1047314493.0, "reward": 0.6863839626312256, "reward_std": 0.16101153194904327, "rewards/simpleverify_reward/mean": 0.6863839030265808, "rewards/simpleverify_reward/std": 0.46422144770622253, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2559.0, "completions/mean_length": 628.7600708007812, "completions/mean_terminated_length": 609.3030395507812, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 14.839650145772595, "grad_norm": 0.23130422830581665, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 1047970406.0, "reward": 0.7500000596046448, "reward_std": 0.11355186253786087, "rewards/simpleverify_reward/mean": 0.75, "rewards/simpleverify_reward/std": 0.43325456976890564, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0011160714285713969, "completions/max_length": 4096.0, "completions/max_terminated_length": 2513.0, "completions/mean_length": 733.435302734375, "completions/mean_terminated_length": 729.6781616210938, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 14.848979591836734, "grad_norm": 0.18913036584854126, "learning_rate": 1e-06, "loss": 0.0251, "num_tokens": 1048716372.0, "reward": 0.6651785969734192, "reward_std": 0.14792746305465698, "rewards/simpleverify_reward/mean": 0.6651785969734192, "rewards/simpleverify_reward/std": 0.47219160199165344, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013392857142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3921.0, "completions/mean_length": 761.6875610351562, "completions/mean_terminated_length": 716.4253540039062, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 14.858309037900874, "grad_norm": 0.15965145826339722, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 1049491628.0, "reward": 0.6640625, "reward_std": 0.1304224729537964, "rewards/simpleverify_reward/mean": 0.6640625, "rewards/simpleverify_reward/std": 0.4725809693336487, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2689.0, "completions/mean_length": 669.6261596679688, "completions/mean_terminated_length": 650.3984375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 14.867638483965015, "grad_norm": 0.17191046476364136, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 1050179829.0, "reward": 0.7421875596046448, "reward_std": 0.12414999306201935, "rewards/simpleverify_reward/mean": 0.7421875, "rewards/simpleverify_reward/std": 0.43767455220222473, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 2243.0, "completions/mean_length": 611.7299194335938, "completions/mean_terminated_length": 600.024658203125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 14.876967930029155, "grad_norm": 0.17882762849330902, "learning_rate": 1e-06, "loss": 0.0134, "num_tokens": 1050815771.0, "reward": 0.793526828289032, "reward_std": 0.11670970916748047, "rewards/simpleverify_reward/mean": 0.7935267686843872, "rewards/simpleverify_reward/std": 0.40500015020370483, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3313.0, "completions/mean_length": 740.2678833007812, "completions/mean_terminated_length": 706.2186889648438, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 14.886297376093294, "grad_norm": 0.18628795444965363, "learning_rate": 1e-06, "loss": 0.034, "num_tokens": 1051575283.0, "reward": 0.6774553656578064, "reward_std": 0.149286150932312, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0033482142857143016, "completions/max_length": 4096.0, "completions/max_terminated_length": 3828.0, "completions/mean_length": 674.646240234375, "completions/mean_terminated_length": 663.15234375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 14.895626822157434, "grad_norm": 0.15980187058448792, "learning_rate": 1e-06, "loss": 0.0347, "num_tokens": 1052272046.0, "reward": 0.6774553656578064, "reward_std": 0.10325934737920761, "rewards/simpleverify_reward/mean": 0.6774553656578064, "rewards/simpleverify_reward/std": 0.4677111804485321, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3791.0, "completions/mean_length": 710.9063110351562, "completions/mean_terminated_length": 688.0853881835938, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 14.904956268221575, "grad_norm": 0.16355322301387787, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 1052999650.0, "reward": 0.6908482313156128, "reward_std": 0.11562410742044449, "rewards/simpleverify_reward/mean": 0.6908482313156128, "rewards/simpleverify_reward/std": 0.46240198612213135, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.004464285714285698, "completions/max_length": 4096.0, "completions/max_terminated_length": 3274.0, "completions/mean_length": 702.6172485351562, "completions/mean_terminated_length": 687.4002685546875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 14.914285714285715, "grad_norm": 0.1838567554950714, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 1053714891.0, "reward": 0.7131696939468384, "reward_std": 0.13951106369495392, "rewards/simpleverify_reward/mean": 0.7131696343421936, "rewards/simpleverify_reward/std": 0.4525342881679535, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 2903.0, "completions/mean_length": 647.1953125, "completions/mean_terminated_length": 627.841796875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 14.923615160349854, "grad_norm": 0.15782670676708221, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 1054376458.0, "reward": 0.7656250596046448, "reward_std": 0.08698716014623642, "rewards/simpleverify_reward/mean": 0.765625, "rewards/simpleverify_reward/std": 0.4238441288471222, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005580357142857095, "completions/max_length": 4096.0, "completions/max_terminated_length": 3986.0, "completions/mean_length": 697.3750610351562, "completions/mean_terminated_length": 678.3030395507812, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 14.932944606413994, "grad_norm": 0.1708441972732544, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 1055085290.0, "reward": 0.7243303656578064, "reward_std": 0.13023702800273895, "rewards/simpleverify_reward/mean": 0.7243303656578064, "rewards/simpleverify_reward/std": 0.4471006691455841, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010044642857142905, "completions/max_length": 4096.0, "completions/max_terminated_length": 3388.0, "completions/mean_length": 698.6082763671875, "completions/mean_terminated_length": 664.1364135742188, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 14.942274052478133, "grad_norm": 0.15127886831760406, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 1055803715.0, "reward": 0.6930803656578064, "reward_std": 0.1169707253575325, "rewards/simpleverify_reward/mean": 0.6930803656578064, "rewards/simpleverify_reward/std": 0.46147337555885315, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.006696428571428603, "completions/max_length": 4096.0, "completions/max_terminated_length": 3667.0, "completions/mean_length": 780.1339721679688, "completions/mean_terminated_length": 757.77978515625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 14.951603498542275, "grad_norm": 0.16772863268852234, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 1056603115.0, "reward": 0.6316964626312256, "reward_std": 0.11757732927799225, "rewards/simpleverify_reward/mean": 0.6316964030265808, "rewards/simpleverify_reward/std": 0.4826137125492096, "step": 1600 }, { "epoch": 14.951603498542275, "step": 1600, "total_flos": 0.0, "train_loss": 0.020633934048528316, "train_runtime": 141634.4702, "train_samples_per_second": 10.122, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 1600, "num_input_tokens_seen": 1056603115, "num_train_epochs": 15, "save_steps": 160, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }