diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6577 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.998452810727179, + "eval_steps": 500, + "global_step": 242, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2295.0, + "completions/max_terminated_length": 2295.0, + "completions/mean_length": 481.84600830078125, + "completions/mean_terminated_length": 481.84600830078125, + "completions/min_length": 3.0, + "completions/min_terminated_length": 3.0, + "epoch": 0.0041258380608561115, + "grad_norm": 0.24569128453731537, + "kl": 0.0, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 482686.0, + "reward": 0.1160714328289032, + "reward_std": 0.20019538700580597, + "rewards/code_format_reward/mean": 0.046875, + "rewards/code_format_reward/std": 0.21160738170146942, + "rewards/curriculum_aware_reward_fn/mean": 0.0691964253783226, + "rewards/curriculum_aware_reward_fn/std": 0.2627292275428772, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1400.0, + "completions/max_terminated_length": 1400.0, + "completions/mean_length": 481.9620666503906, + "completions/mean_terminated_length": 481.9620666503906, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.008251676121712223, + "grad_norm": 0.24532559514045715, + "kl": 0.00029015541076660156, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 947945.0, + "reward": 0.122767873108387, + "reward_std": 0.20709191262722015, + "rewards/code_format_reward/mean": 0.0446428582072258, + "rewards/code_format_reward/std": 0.2067493349313736, + "rewards/curriculum_aware_reward_fn/mean": 0.078125, + "rewards/curriculum_aware_reward_fn/std": 0.26866820454597473, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1337.0, + "completions/max_terminated_length": 1337.0, + "completions/mean_length": 478.7745666503906, + "completions/mean_terminated_length": 478.7745666503906, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.012377514182568335, + "grad_norm": 0.303946316242218, + "kl": 0.0003151893615722656, + "learning_rate": 1e-06, + "loss": 0.02, + "num_tokens": 1414835.0, + "reward": 0.1473214328289032, + "reward_std": 0.270842045545578, + "rewards/code_format_reward/mean": 0.0758928582072258, + "rewards/code_format_reward/std": 0.265122652053833, + "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, + "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1680.0, + "completions/max_terminated_length": 1680.0, + "completions/mean_length": 491.4375305175781, + "completions/mean_terminated_length": 491.4375305175781, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.016503352243424446, + "grad_norm": 0.27564534544944763, + "kl": 0.0003771781921386719, + "learning_rate": 1e-06, + "loss": 0.013, + "num_tokens": 1915020.0, + "reward": 0.1406250298023224, + "reward_std": 0.2626669406890869, + "rewards/code_format_reward/mean": 0.0647321417927742, + "rewards/code_format_reward/std": 0.24632768332958221, + "rewards/curriculum_aware_reward_fn/mean": 0.0758928582072258, + "rewards/curriculum_aware_reward_fn/std": 0.265122652053833, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1558.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 510.8504638671875, + "completions/mean_terminated_length": 510.8504638671875, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 0.020629190304280558, + "grad_norm": 0.3501759469509125, + "kl": 0.0005273818969726562, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 2422824.0, + "reward": 0.2343750149011612, + "reward_std": 0.3528774082660675, + "rewards/code_format_reward/mean": 0.1361607164144516, + "rewards/code_format_reward/std": 0.34334251284599304, + "rewards/curriculum_aware_reward_fn/mean": 0.0982142835855484, + "rewards/curriculum_aware_reward_fn/std": 0.29793688654899597, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1621.0, + "completions/mean_length": 467.83038330078125, + "completions/mean_terminated_length": 459.7136535644531, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 0.02475502836513667, + "grad_norm": 0.41125398874282837, + "kl": 0.0011577606201171875, + "learning_rate": 1e-06, + "loss": 0.0271, + "num_tokens": 2907715.0, + "reward": 0.314732164144516, + "reward_std": 0.4204106032848358, + "rewards/code_format_reward/mean": 0.203125, + "rewards/code_format_reward/std": 0.4027745723724365, + "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742, + "rewards/curriculum_aware_reward_fn/std": 0.31523454189300537, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1866.0, + "completions/max_terminated_length": 1866.0, + "completions/mean_length": 464.36163330078125, + "completions/mean_terminated_length": 464.36163330078125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.02888086642599278, + "grad_norm": 0.4432518482208252, + "kl": 0.0016002655029296875, + "learning_rate": 1e-06, + "loss": 0.0249, + "num_tokens": 3396491.0, + "reward": 0.3549107611179352, + "reward_std": 0.4484630227088928, + "rewards/code_format_reward/mean": 0.2700892984867096, + "rewards/code_format_reward/std": 0.444502055644989, + "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, + "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2789.0, + "completions/max_terminated_length": 2789.0, + "completions/mean_length": 434.5513610839844, + "completions/mean_terminated_length": 434.5513610839844, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.03300670448684889, + "grad_norm": 0.5083041191101074, + "kl": 0.0022182464599609375, + "learning_rate": 1e-06, + "loss": 0.0264, + "num_tokens": 3854503.0, + "reward": 0.4285714626312256, + "reward_std": 0.5326574444770813, + "rewards/code_format_reward/mean": 0.3571428656578064, + "rewards/code_format_reward/std": 0.47969308495521545, + "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, + "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1266.0, + "completions/max_terminated_length": 1266.0, + "completions/mean_length": 458.1339416503906, + "completions/mean_terminated_length": 458.1339416503906, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.037132542547705004, + "grad_norm": 0.470951110124588, + "kl": 0.0035037994384765625, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 4321246.0, + "reward": 0.5022321939468384, + "reward_std": 0.5391286015510559, + "rewards/code_format_reward/mean": 0.4151785671710968, + "rewards/code_format_reward/std": 0.49330368638038635, + "rewards/curriculum_aware_reward_fn/mean": 0.0870535746216774, + "rewards/curriculum_aware_reward_fn/std": 0.2822286784648895, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1260.0, + "completions/max_terminated_length": 1260.0, + "completions/mean_length": 413.63616943359375, + "completions/mean_terminated_length": 413.63616943359375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.041258380608561115, + "grad_norm": 0.43107807636260986, + "kl": 0.0074920654296875, + "learning_rate": 1e-06, + "loss": 0.0132, + "num_tokens": 4777000.0, + "reward": 0.7700893878936768, + "reward_std": 0.5293206572532654, + "rewards/code_format_reward/mean": 0.6830357313156128, + "rewards/code_format_reward/std": 0.4658135175704956, + "rewards/curriculum_aware_reward_fn/mean": 0.0870535746216774, + "rewards/curriculum_aware_reward_fn/std": 0.2822286784648895, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1706.0, + "completions/mean_length": 430.3906555175781, + "completions/mean_terminated_length": 422.1901550292969, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.04538421866941723, + "grad_norm": 0.4257507026195526, + "kl": 0.00693511962890625, + "learning_rate": 1e-06, + "loss": 0.0281, + "num_tokens": 5234863.0, + "reward": 0.8683035969734192, + "reward_std": 0.4879590570926666, + "rewards/code_format_reward/mean": 0.7477678656578064, + "rewards/code_format_reward/std": 0.4347792863845825, + "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3014.0, + "completions/mean_length": 438.3750305175781, + "completions/mean_terminated_length": 430.1923828125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.04951005673027334, + "grad_norm": 0.3997235596179962, + "kl": 0.007152557373046875, + "learning_rate": 1e-06, + "loss": 0.0256, + "num_tokens": 5699194.0, + "reward": 0.8705357313156128, + "reward_std": 0.39619719982147217, + "rewards/code_format_reward/mean": 0.8102678656578064, + "rewards/code_format_reward/std": 0.39252740144729614, + "rewards/curriculum_aware_reward_fn/mean": 0.0602678582072258, + "rewards/curriculum_aware_reward_fn/std": 0.23824848234653473, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2370.0, + "completions/max_terminated_length": 2370.0, + "completions/mean_length": 409.2701110839844, + "completions/mean_terminated_length": 409.2701110839844, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.05363589479112945, + "grad_norm": 0.39804381132125854, + "kl": 0.00933074951171875, + "learning_rate": 1e-06, + "loss": 0.024, + "num_tokens": 6151046.0, + "reward": 0.930803656578064, + "reward_std": 0.4214654862880707, + "rewards/code_format_reward/mean": 0.8214285969734192, + "rewards/code_format_reward/std": 0.3834212124347687, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1497.0, + "completions/max_terminated_length": 1497.0, + "completions/mean_length": 403.5692138671875, + "completions/mean_terminated_length": 403.5692138671875, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.05776173285198556, + "grad_norm": 0.372781902551651, + "kl": 0.0071258544921875, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 6616948.0, + "reward": 0.9910714626312256, + "reward_std": 0.3275630474090576, + "rewards/code_format_reward/mean": 0.8727678656578064, + "rewards/code_format_reward/std": 0.3336053788661957, + "rewards/curriculum_aware_reward_fn/mean": 0.1183035746216774, + "rewards/curriculum_aware_reward_fn/std": 0.32332828640937805, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1976.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 395.19866943359375, + "completions/mean_terminated_length": 395.19866943359375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.06188757091284167, + "grad_norm": 0.35227906703948975, + "kl": 0.007076263427734375, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 7048573.0, + "reward": 1.0290179252624512, + "reward_std": 0.3685193955898285, + "rewards/code_format_reward/mean": 0.8816964030265808, + "rewards/code_format_reward/std": 0.32332828640937805, + "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1582.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 401.7633972167969, + "completions/mean_terminated_length": 401.7633972167969, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.06601340897369778, + "grad_norm": 0.37148913741111755, + "kl": 0.009387969970703125, + "learning_rate": 1e-06, + "loss": 0.0612, + "num_tokens": 7513155.0, + "reward": 1.0022321939468384, + "reward_std": 0.335994154214859, + "rewards/code_format_reward/mean": 0.90625, + "rewards/code_format_reward/std": 0.2918064594268799, + "rewards/curriculum_aware_reward_fn/mean": 0.0959821417927742, + "rewards/curriculum_aware_reward_fn/std": 0.29489606618881226, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2088.0, + "completions/max_terminated_length": 2088.0, + "completions/mean_length": 400.2812805175781, + "completions/mean_terminated_length": 400.2812805175781, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.07013924703455389, + "grad_norm": 0.3265136480331421, + "kl": 0.01309967041015625, + "learning_rate": 1e-06, + "loss": 0.0591, + "num_tokens": 7951544.0, + "reward": 1.0267857313156128, + "reward_std": 0.2912137508392334, + "rewards/code_format_reward/mean": 0.9375, + "rewards/code_format_reward/std": 0.24233205616474152, + "rewards/curriculum_aware_reward_fn/mean": 0.0892857164144516, + "rewards/curriculum_aware_reward_fn/std": 0.30073946714401245, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 367.97100830078125, + "completions/mean_terminated_length": 367.97100830078125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.07426508509541001, + "grad_norm": 0.2556539475917816, + "kl": 0.009555816650390625, + "learning_rate": 1e-06, + "loss": 0.0147, + "num_tokens": 8385776.0, + "reward": 1.0982143878936768, + "reward_std": 0.20816494524478912, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1616371124982834, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1695.0, + "completions/max_terminated_length": 1695.0, + "completions/mean_length": 390.2969055175781, + "completions/mean_terminated_length": 390.2969055175781, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.07839092315626611, + "grad_norm": 0.2662300765514374, + "kl": 0.0101318359375, + "learning_rate": 1e-06, + "loss": 0.019, + "num_tokens": 8851108.0, + "reward": 1.0669643878936768, + "reward_std": 0.18616731464862823, + "rewards/code_format_reward/mean": 0.9776785969734192, + "rewards/code_format_reward/std": 0.1478918492794037, + "rewards/curriculum_aware_reward_fn/mean": 0.0892857164144516, + "rewards/curriculum_aware_reward_fn/std": 0.2854744791984558, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2309.0, + "completions/max_terminated_length": 2309.0, + "completions/mean_length": 379.9040222167969, + "completions/mean_terminated_length": 379.9040222167969, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.08251676121712223, + "grad_norm": 0.23065051436424255, + "kl": 0.01053619384765625, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 9290490.0, + "reward": 1.118303656578064, + "reward_std": 0.17734426259994507, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1933.0, + "completions/max_terminated_length": 1933.0, + "completions/mean_length": 356.87054443359375, + "completions/mean_terminated_length": 356.87054443359375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.08664259927797834, + "grad_norm": 0.625492513179779, + "kl": 0.04799652099609375, + "learning_rate": 1e-06, + "loss": 0.0243, + "num_tokens": 9711349.0, + "reward": 1.078125, + "reward_std": 0.15985102951526642, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, + "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1136.0, + "completions/max_terminated_length": 1136.0, + "completions/mean_length": 349.20538330078125, + "completions/mean_terminated_length": 349.20538330078125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.09076843733883445, + "grad_norm": 0.21386803686618805, + "kl": 0.0127105712890625, + "learning_rate": 1e-06, + "loss": 0.018, + "num_tokens": 10124924.0, + "reward": 1.0982143878936768, + "reward_std": 0.16575203835964203, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774, + "rewards/curriculum_aware_reward_fn/std": 0.30387791991233826, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1185.0, + "completions/max_terminated_length": 1185.0, + "completions/mean_length": 341.2589416503906, + "completions/mean_terminated_length": 341.2589416503906, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.09489427539969056, + "grad_norm": 0.2108008712530136, + "kl": 0.01441192626953125, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 10540357.0, + "reward": 1.078125, + "reward_std": 0.13501358032226562, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226, + "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 358.1629638671875, + "completions/mean_terminated_length": 358.1629638671875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.09902011346054668, + "grad_norm": 0.22388455271720886, + "kl": 0.01308441162109375, + "learning_rate": 1e-06, + "loss": 0.0167, + "num_tokens": 10973079.0, + "reward": 1.118303656578064, + "reward_std": 0.15738239884376526, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1227678582072258, + "rewards/curriculum_aware_reward_fn/std": 0.3285374045372009, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1847.0, + "completions/max_terminated_length": 1847.0, + "completions/mean_length": 345.7008972167969, + "completions/mean_terminated_length": 345.7008972167969, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.10314595152140278, + "grad_norm": 0.9551708102226257, + "kl": 0.07257080078125, + "learning_rate": 1e-06, + "loss": 0.0215, + "num_tokens": 11398370.0, + "reward": 1.1294643878936768, + "reward_std": 0.18270127475261688, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1139.0, + "completions/max_terminated_length": 1139.0, + "completions/mean_length": 354.1875305175781, + "completions/mean_terminated_length": 354.1875305175781, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.1072717895822589, + "grad_norm": 0.20303034782409668, + "kl": 0.0153656005859375, + "learning_rate": 1e-06, + "loss": 0.0156, + "num_tokens": 11834365.0, + "reward": 1.102678656578064, + "reward_std": 0.136513814330101, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774, + "rewards/curriculum_aware_reward_fn/std": 0.30387789011001587, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1112.0, + "completions/max_terminated_length": 1112.0, + "completions/mean_length": 348.15179443359375, + "completions/mean_terminated_length": 348.15179443359375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.111397627643115, + "grad_norm": 0.23060455918312073, + "kl": 0.01444244384765625, + "learning_rate": 1e-06, + "loss": 0.0246, + "num_tokens": 12265857.0, + "reward": 1.118303656578064, + "reward_std": 0.19625361263751984, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.3377779722213745, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1949.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 341.3035888671875, + "completions/mean_terminated_length": 341.3035888671875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.11552346570397112, + "grad_norm": 0.21146497130393982, + "kl": 0.0152740478515625, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 12689969.0, + "reward": 1.1205358505249023, + "reward_std": 0.14929363131523132, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2630.0, + "completions/max_terminated_length": 2630.0, + "completions/mean_length": 342.83929443359375, + "completions/mean_terminated_length": 342.83929443359375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.11964930376482723, + "grad_norm": 0.2256154865026474, + "kl": 0.01830291748046875, + "learning_rate": 1e-06, + "loss": 0.0169, + "num_tokens": 13120551.0, + "reward": 1.1160714626312256, + "reward_std": 0.168566033244133, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2030.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 362.7812805175781, + "completions/mean_terminated_length": 362.7812805175781, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.12377514182568335, + "grad_norm": 0.2010965794324875, + "kl": 0.015350341796875, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 13546445.0, + "reward": 1.109375, + "reward_std": 0.15386441349983215, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742, + "rewards/curriculum_aware_reward_fn/std": 0.31523454189300537, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1668.0, + "completions/max_terminated_length": 1668.0, + "completions/mean_length": 363.0826110839844, + "completions/mean_terminated_length": 363.0826110839844, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.12790097988653945, + "grad_norm": 0.171152725815773, + "kl": 0.01485443115234375, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 13991038.0, + "reward": 1.1272321939468384, + "reward_std": 0.1273893564939499, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, + "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1164.0, + "completions/max_terminated_length": 1164.0, + "completions/mean_length": 357.60491943359375, + "completions/mean_terminated_length": 357.60491943359375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.13202681794739557, + "grad_norm": 0.22520698606967926, + "kl": 0.01274871826171875, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 14421823.0, + "reward": 1.1361607313156128, + "reward_std": 0.17906279861927032, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1361607164144516, + "rewards/curriculum_aware_reward_fn/std": 0.34334251284599304, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1669.0, + "completions/max_terminated_length": 1669.0, + "completions/mean_length": 375.04913330078125, + "completions/mean_terminated_length": 375.04913330078125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.1361526560082517, + "grad_norm": 0.25881803035736084, + "kl": 0.0135498046875, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 14867486.0, + "reward": 1.0825893878936768, + "reward_std": 0.16945692896842957, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.0870535746216774, + "rewards/curriculum_aware_reward_fn/std": 0.2822286784648895, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1557.0, + "completions/max_terminated_length": 1557.0, + "completions/mean_length": 373.47100830078125, + "completions/mean_terminated_length": 373.47100830078125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.14027849406910778, + "grad_norm": 0.22023342549800873, + "kl": 0.01361846923828125, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 15295971.0, + "reward": 1.0982143878936768, + "reward_std": 0.16011416912078857, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226, + "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1468.0, + "completions/max_terminated_length": 1468.0, + "completions/mean_length": 359.8437805175781, + "completions/mean_terminated_length": 359.8437805175781, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.1444043321299639, + "grad_norm": 0.2497127801179886, + "kl": 0.01373291015625, + "learning_rate": 1e-06, + "loss": 0.0251, + "num_tokens": 15741577.0, + "reward": 1.1450893878936768, + "reward_std": 0.22232261300086975, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968, + "rewards/curriculum_aware_reward_fn/std": 0.35703200101852417, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1656.0, + "completions/max_terminated_length": 1656.0, + "completions/mean_length": 371.7745666503906, + "completions/mean_terminated_length": 371.7745666503906, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.14853017019082002, + "grad_norm": 0.2346869707107544, + "kl": 0.01375579833984375, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 16177408.0, + "reward": 1.1205357313156128, + "reward_std": 0.19378496706485748, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3303.0, + "completions/max_terminated_length": 3303.0, + "completions/mean_length": 377.1004638671875, + "completions/mean_terminated_length": 377.1004638671875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.15265600825167613, + "grad_norm": 0.20092125236988068, + "kl": 0.011962890625, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 16614421.0, + "reward": 1.087053656578064, + "reward_std": 0.1473119556903839, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.0870535746216774, + "rewards/curriculum_aware_reward_fn/std": 0.2822286784648895, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3311.0, + "completions/max_terminated_length": 3311.0, + "completions/mean_length": 371.7276916503906, + "completions/mean_terminated_length": 371.7276916503906, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.15678184631253222, + "grad_norm": 0.20405592024326324, + "kl": 0.0123443603515625, + "learning_rate": 1e-06, + "loss": 0.0315, + "num_tokens": 17038443.0, + "reward": 1.1294643878936768, + "reward_std": 0.17602959275245667, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2603.0, + "completions/max_terminated_length": 2603.0, + "completions/mean_length": 392.85491943359375, + "completions/mean_terminated_length": 392.85491943359375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.16090768437338834, + "grad_norm": 0.21517439186573029, + "kl": 0.01271820068359375, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 17485083.0, + "reward": 1.0982143878936768, + "reward_std": 0.1524314433336258, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226, + "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1344.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 376.9107360839844, + "completions/mean_terminated_length": 376.9107360839844, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.16503352243424446, + "grad_norm": 0.25572851300239563, + "kl": 0.01190948486328125, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 17917580.0, + "reward": 1.102678656578064, + "reward_std": 0.1929987519979477, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1071428582072258, + "rewards/curriculum_aware_reward_fn/std": 0.3096405565738678, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 378.2388610839844, + "completions/mean_terminated_length": 369.92169189453125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.16915936049510058, + "grad_norm": 2.37631893157959, + "kl": 0.1428680419921875, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 18357651.0, + "reward": 1.1227679252624512, + "reward_std": 0.15738239884376526, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1205.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 330.35491943359375, + "completions/mean_terminated_length": 330.35491943359375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.17328519855595667, + "grad_norm": 0.2657046616077423, + "kl": 0.01459503173828125, + "learning_rate": 1e-06, + "loss": 0.0266, + "num_tokens": 18747024.0, + "reward": 1.1450893878936768, + "reward_std": 0.22222581505775452, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, + "rewards/curriculum_aware_reward_fn/std": 0.36136937141418457, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2859.0, + "completions/mean_length": 376.16741943359375, + "completions/mean_terminated_length": 367.84564208984375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.1774110366168128, + "grad_norm": 0.23751172423362732, + "kl": 0.011627197265625, + "learning_rate": 1e-06, + "loss": 0.0478, + "num_tokens": 19176227.0, + "reward": 1.165178656578064, + "reward_std": 0.22029609978199005, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1600.0, + "completions/max_terminated_length": 1600.0, + "completions/mean_length": 371.5602722167969, + "completions/mean_terminated_length": 371.5602722167969, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.1815368746776689, + "grad_norm": 0.22557100653648376, + "kl": 0.0122222900390625, + "learning_rate": 1e-06, + "loss": 0.0307, + "num_tokens": 19588183.0, + "reward": 1.0892857313156128, + "reward_std": 0.19364552199840546, + "rewards/code_format_reward/mean": 0.9888392686843872, + "rewards/code_format_reward/std": 0.10517053306102753, + "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226, + "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1344.0, + "completions/max_terminated_length": 1344.0, + "completions/mean_length": 381.6026916503906, + "completions/mean_terminated_length": 381.6026916503906, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.18566271273852503, + "grad_norm": 0.24813039600849152, + "kl": 0.0107421875, + "learning_rate": 1e-06, + "loss": 0.0071, + "num_tokens": 20048563.0, + "reward": 1.118303656578064, + "reward_std": 0.18272370100021362, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.32595089077949524, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3901.0, + "completions/max_terminated_length": 3901.0, + "completions/mean_length": 373.51116943359375, + "completions/mean_terminated_length": 373.51116943359375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.18978855079938112, + "grad_norm": 0.25957804918289185, + "kl": 0.010589599609375, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 20484045.0, + "reward": 1.125, + "reward_std": 0.22440890967845917, + "rewards/code_format_reward/mean": 0.9799107313156128, + "rewards/code_format_reward/std": 0.14046260714530945, + "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3525845408439636, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1483.0, + "completions/max_terminated_length": 1483.0, + "completions/mean_length": 377.4888610839844, + "completions/mean_terminated_length": 377.4888610839844, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.19391438886023724, + "grad_norm": 0.24632863700389862, + "kl": 0.010894775390625, + "learning_rate": 1e-06, + "loss": 0.0223, + "num_tokens": 20923290.0, + "reward": 1.1584821939468384, + "reward_std": 0.23042914271354675, + "rewards/code_format_reward/mean": 0.9866071343421936, + "rewards/code_format_reward/std": 0.11507844179868698, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1178.0, + "completions/max_terminated_length": 1178.0, + "completions/mean_length": 340.9508972167969, + "completions/mean_terminated_length": 340.9508972167969, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.19804022692109335, + "grad_norm": 0.29566898941993713, + "kl": 0.01230621337890625, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 21330717.0, + "reward": 1.149553656578064, + "reward_std": 0.23722697794437408, + "rewards/code_format_reward/mean": 0.984375, + "rewards/code_format_reward/std": 0.12415824085474014, + "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, + "rewards/curriculum_aware_reward_fn/std": 0.37175679206848145, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1488.0, + "completions/max_terminated_length": 1488.0, + "completions/mean_length": 348.87054443359375, + "completions/mean_terminated_length": 348.87054443359375, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.20216606498194944, + "grad_norm": 0.2383718490600586, + "kl": 0.0120697021484375, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 21756945.0, + "reward": 1.087053656578064, + "reward_std": 0.1802413910627365, + "rewards/code_format_reward/mean": 0.984375, + "rewards/code_format_reward/std": 0.12415824085474014, + "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774, + "rewards/curriculum_aware_reward_fn/std": 0.31115278601646423, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1112.0, + "completions/max_terminated_length": 1112.0, + "completions/mean_length": 348.9754638671875, + "completions/mean_terminated_length": 348.9754638671875, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.20629190304280556, + "grad_norm": 0.2150750309228897, + "kl": 0.01216888427734375, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 22173742.0, + "reward": 1.109375, + "reward_std": 0.1667092889547348, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.1183035746216774, + "rewards/curriculum_aware_reward_fn/std": 0.32332825660705566, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1356.0, + "completions/max_terminated_length": 1356.0, + "completions/mean_length": 340.68304443359375, + "completions/mean_terminated_length": 340.68304443359375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.21041774110366168, + "grad_norm": 0.24686773121356964, + "kl": 0.0117034912109375, + "learning_rate": 1e-06, + "loss": 0.0149, + "num_tokens": 22590268.0, + "reward": 1.149553656578064, + "reward_std": 0.17362166941165924, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, + "rewards/curriculum_aware_reward_fn/std": 0.36136940121650696, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1582.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 364.9888610839844, + "completions/mean_terminated_length": 364.9888610839844, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.2145435791645178, + "grad_norm": 0.24674323201179504, + "kl": 0.011444091796875, + "learning_rate": 1e-06, + "loss": -0.0145, + "num_tokens": 23044836.0, + "reward": 1.1049107313156128, + "reward_std": 0.20420950651168823, + "rewards/code_format_reward/mean": 0.9799107313156128, + "rewards/code_format_reward/std": 0.14046260714530945, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1734.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 349.61163330078125, + "completions/mean_terminated_length": 349.61163330078125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.2186694172253739, + "grad_norm": 0.27109435200691223, + "kl": 0.0108489990234375, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 23465278.0, + "reward": 1.1316965818405151, + "reward_std": 0.2340708076953888, + "rewards/code_format_reward/mean": 0.984375, + "rewards/code_format_reward/std": 0.12415824085474014, + "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3222.0, + "completions/max_terminated_length": 3222.0, + "completions/mean_length": 359.0558166503906, + "completions/mean_terminated_length": 359.0558166503906, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "epoch": 0.22279525528623, + "grad_norm": 0.2768162488937378, + "kl": 0.01114654541015625, + "learning_rate": 1e-06, + "loss": 0.0068, + "num_tokens": 23879647.0, + "reward": 1.118303656578064, + "reward_std": 0.2705444395542145, + "rewards/code_format_reward/mean": 0.9754464030265808, + "rewards/code_format_reward/std": 0.1549331247806549, + "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, + "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1822.0, + "completions/max_terminated_length": 1822.0, + "completions/mean_length": 355.20538330078125, + "completions/mean_terminated_length": 355.20538330078125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.22692109334708613, + "grad_norm": 0.3178320527076721, + "kl": 0.01139068603515625, + "learning_rate": 1e-06, + "loss": -0.0191, + "num_tokens": 24333101.0, + "reward": 1.071428656578064, + "reward_std": 0.24884316325187683, + "rewards/code_format_reward/mean": 0.9620535969734192, + "rewards/code_format_reward/std": 0.191280335187912, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 361.8750305175781, + "completions/mean_terminated_length": 353.521240234375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.23104693140794225, + "grad_norm": 0.23652935028076172, + "kl": 0.01201629638671875, + "learning_rate": 1e-06, + "loss": 0.0182, + "num_tokens": 24784965.0, + "reward": 1.046875, + "reward_std": 0.16329465806484222, + "rewards/code_format_reward/mean": 0.9754464030265808, + "rewards/code_format_reward/std": 0.1549331247806549, + "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774, + "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2192.0, + "completions/max_terminated_length": 2192.0, + "completions/mean_length": 353.0982360839844, + "completions/mean_terminated_length": 353.0982360839844, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.23517276946879834, + "grad_norm": 0.24375389516353607, + "kl": 0.01190185546875, + "learning_rate": 1e-06, + "loss": 0.0192, + "num_tokens": 25220899.0, + "reward": 1.1450893878936768, + "reward_std": 0.23259766399860382, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, + "rewards/curriculum_aware_reward_fn/std": 0.37354570627212524, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1225.0, + "completions/max_terminated_length": 1225.0, + "completions/mean_length": 322.4375, + "completions/mean_terminated_length": 322.4375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.23929860752965446, + "grad_norm": 0.2876569628715515, + "kl": 0.01218414306640625, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 25626276.0, + "reward": 1.165178656578064, + "reward_std": 0.23654192686080933, + "rewards/code_format_reward/mean": 0.9821428656578064, + "rewards/code_format_reward/std": 0.13258016109466553, + "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1461.0, + "completions/max_terminated_length": 1461.0, + "completions/mean_length": 352.4129638671875, + "completions/mean_terminated_length": 352.4129638671875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.24342444559051057, + "grad_norm": 0.2399449199438095, + "kl": 0.01129150390625, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 26054007.0, + "reward": 1.1071429252624512, + "reward_std": 0.19277828931808472, + "rewards/code_format_reward/mean": 0.9799107313156128, + "rewards/code_format_reward/std": 0.14046260714530945, + "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, + "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 339.77679443359375, + "completions/mean_terminated_length": 339.77679443359375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.2475502836513667, + "grad_norm": 0.23095594346523285, + "kl": 0.01131439208984375, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 26487718.0, + "reward": 1.133928656578064, + "reward_std": 0.1849469542503357, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, + "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1105.0, + "completions/max_terminated_length": 1105.0, + "completions/mean_length": 339.27679443359375, + "completions/mean_terminated_length": 339.27679443359375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.2516761217122228, + "grad_norm": 0.25108498334884644, + "kl": 0.0118255615234375, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 26899939.0, + "reward": 1.1540179252624512, + "reward_std": 0.22935986518859863, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3676777780056, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1374.0, + "completions/max_terminated_length": 1374.0, + "completions/mean_length": 358.0692138671875, + "completions/mean_terminated_length": 358.0692138671875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.2558019597730789, + "grad_norm": 0.25201165676116943, + "kl": 0.01178741455078125, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 27344141.0, + "reward": 1.1138393878936768, + "reward_std": 0.2197514921426773, + "rewards/code_format_reward/mean": 0.984375, + "rewards/code_format_reward/std": 0.12415824085474014, + "rewards/curriculum_aware_reward_fn/mean": 0.1294642835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3360883891582489, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1323.0, + "completions/max_terminated_length": 1323.0, + "completions/mean_length": 343.32366943359375, + "completions/mean_terminated_length": 343.32366943359375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.259927797833935, + "grad_norm": 0.28083741664886475, + "kl": 0.01128387451171875, + "learning_rate": 1e-06, + "loss": 0.0269, + "num_tokens": 27760464.0, + "reward": 1.1741071939468384, + "reward_std": 0.25979822874069214, + "rewards/code_format_reward/mean": 0.9866071343421936, + "rewards/code_format_reward/std": 0.11507843434810638, + "rewards/curriculum_aware_reward_fn/mean": 0.1875, + "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2247.0, + "completions/max_terminated_length": 2247.0, + "completions/mean_length": 379.7344055175781, + "completions/mean_terminated_length": 379.7344055175781, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.26405363589479114, + "grad_norm": 0.23068185150623322, + "kl": 0.01107025146484375, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 28210466.0, + "reward": 1.1049107313156128, + "reward_std": 0.161203071475029, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742, + "rewards/curriculum_aware_reward_fn/std": 0.315234512090683, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2890.0, + "completions/max_terminated_length": 2890.0, + "completions/mean_length": 363.1094055175781, + "completions/mean_terminated_length": 363.1094055175781, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.26817947395564723, + "grad_norm": 0.24125061929225922, + "kl": 0.01074981689453125, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 28642520.0, + "reward": 1.09375, + "reward_std": 0.1778312474489212, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.09375, + "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1161.0, + "completions/max_terminated_length": 1161.0, + "completions/mean_length": 346.2187805175781, + "completions/mean_terminated_length": 346.2187805175781, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.2723053120165034, + "grad_norm": 0.2731747329235077, + "kl": 0.010772705078125, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 29067665.0, + "reward": 1.171875, + "reward_std": 0.23607081174850464, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3815346360206604, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 828.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 331.7723388671875, + "completions/mean_terminated_length": 331.7723388671875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.27643115007735947, + "grad_norm": 0.255914568901062, + "kl": 0.0155487060546875, + "learning_rate": 1e-06, + "loss": -0.0129, + "num_tokens": 29473388.0, + "reward": 1.1160714626312256, + "reward_std": 0.16547304391860962, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1969.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 344.6473388671875, + "completions/mean_terminated_length": 344.6473388671875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.28055698813821556, + "grad_norm": 0.23872235417366028, + "kl": 0.0127410888671875, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 29892068.0, + "reward": 1.1629464626312256, + "reward_std": 0.18316583335399628, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, + "rewards/curriculum_aware_reward_fn/std": 0.37175679206848145, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1392.0, + "completions/max_terminated_length": 1392.0, + "completions/mean_length": 345.1629638671875, + "completions/mean_terminated_length": 345.1629638671875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.2846828261990717, + "grad_norm": 0.23450906574726105, + "kl": 0.01248931884765625, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 30304693.0, + "reward": 1.1160714626312256, + "reward_std": 0.20246198773384094, + "rewards/code_format_reward/mean": 0.9888392686843872, + "rewards/code_format_reward/std": 0.10517053306102753, + "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, + "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 351.6607360839844, + "completions/mean_terminated_length": 351.6607360839844, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.2888086642599278, + "grad_norm": 0.2397337406873703, + "kl": 0.01116943359375, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 30717856.0, + "reward": 1.1116071939468384, + "reward_std": 0.19447438418865204, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.1183035746216774, + "rewards/curriculum_aware_reward_fn/std": 0.32332828640937805, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1005.0, + "completions/max_terminated_length": 1005.0, + "completions/mean_length": 340.13616943359375, + "completions/mean_terminated_length": 340.13616943359375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.2929345023207839, + "grad_norm": 0.267532080411911, + "kl": 0.0149688720703125, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 31143909.0, + "reward": 1.1696429252624512, + "reward_std": 0.22853097319602966, + "rewards/code_format_reward/mean": 0.9866071343421936, + "rewards/code_format_reward/std": 0.11507843434810638, + "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1265.0, + "completions/max_terminated_length": 1265.0, + "completions/mean_length": 346.5870666503906, + "completions/mean_terminated_length": 346.5870666503906, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.29706034038164003, + "grad_norm": 0.29796159267425537, + "kl": 0.01158905029296875, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 31579635.0, + "reward": 1.1741071939468384, + "reward_std": 0.2576758563518524, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 344.6763610839844, + "completions/mean_terminated_length": 344.6763610839844, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.3011861784424961, + "grad_norm": 0.2371477484703064, + "kl": 0.01210784912109375, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 31999301.0, + "reward": 1.140625, + "reward_std": 0.18199601769447327, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, + "rewards/curriculum_aware_reward_fn/std": 0.35664716362953186, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3149.0, + "completions/max_terminated_length": 3149.0, + "completions/mean_length": 387.5067138671875, + "completions/mean_terminated_length": 387.5067138671875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.30531201650335227, + "grad_norm": 0.20645320415496826, + "kl": 0.0110015869140625, + "learning_rate": 1e-06, + "loss": 0.0236, + "num_tokens": 32465707.0, + "reward": 1.1294643878936768, + "reward_std": 0.17819245159626007, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 362.12725830078125, + "completions/mean_terminated_length": 362.12725830078125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.30943785456420836, + "grad_norm": 0.23689766228199005, + "kl": 0.0115509033203125, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 32894283.0, + "reward": 1.1116071939468384, + "reward_std": 0.1989629715681076, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1811.0, + "completions/max_terminated_length": 1811.0, + "completions/mean_length": 378.2901916503906, + "completions/mean_terminated_length": 378.2901916503906, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.31356369262506445, + "grad_norm": 0.2228821963071823, + "kl": 0.0117340087890625, + "learning_rate": 1e-06, + "loss": 0.0148, + "num_tokens": 33348734.0, + "reward": 1.1383929252624512, + "reward_std": 0.1881648153066635, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.3480229377746582, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2446.0, + "completions/max_terminated_length": 2446.0, + "completions/mean_length": 400.8750305175781, + "completions/mean_terminated_length": 400.8750305175781, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.3176895306859206, + "grad_norm": 0.2140088975429535, + "kl": 0.0146026611328125, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 33807640.0, + "reward": 1.1674107313156128, + "reward_std": 0.18470536172389984, + "rewards/code_format_reward/mean": 0.9888392686843872, + "rewards/code_format_reward/std": 0.10517053306102753, + "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.42229342460632324, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1205.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 372.58929443359375, + "completions/mean_terminated_length": 372.58929443359375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.3218153687467767, + "grad_norm": 0.24875551462173462, + "kl": 0.01198577880859375, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 34250108.0, + "reward": 1.165178656578064, + "reward_std": 0.20772093534469604, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, + "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1497.0, + "completions/mean_length": 375.45538330078125, + "completions/mean_terminated_length": 367.1319885253906, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.3259412068076328, + "grad_norm": 0.24669714272022247, + "kl": 0.01181793212890625, + "learning_rate": 1e-06, + "loss": 0.0246, + "num_tokens": 34689597.0, + "reward": 1.133928656578064, + "reward_std": 0.17734427750110626, + "rewards/code_format_reward/mean": 0.9888392686843872, + "rewards/code_format_reward/std": 0.10517053306102753, + "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, + "rewards/curriculum_aware_reward_fn/std": 0.352584570646286, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1325.0, + "completions/max_terminated_length": 1325.0, + "completions/mean_length": 345.0714416503906, + "completions/mean_terminated_length": 345.0714416503906, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.3300670448684889, + "grad_norm": 0.221415176987648, + "kl": 0.012115478515625, + "learning_rate": 1e-06, + "loss": 0.0223, + "num_tokens": 35106479.0, + "reward": 1.1473214626312256, + "reward_std": 0.1726084053516388, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968, + "rewards/curriculum_aware_reward_fn/std": 0.3570319712162018, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1138.0, + "completions/max_terminated_length": 1138.0, + "completions/mean_length": 360.6227722167969, + "completions/mean_terminated_length": 360.6227722167969, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.334192882929345, + "grad_norm": 0.21861205995082855, + "kl": 0.01169586181640625, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 35539442.0, + "reward": 1.1897321939468384, + "reward_std": 0.20504766702651978, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, + "rewards/curriculum_aware_reward_fn/std": 0.4253509044647217, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3323.0, + "completions/max_terminated_length": 3323.0, + "completions/mean_length": 371.7656555175781, + "completions/mean_terminated_length": 371.7656555175781, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.33831872099020116, + "grad_norm": 0.24164587259292603, + "kl": 0.0117950439453125, + "learning_rate": 1e-06, + "loss": 0.0235, + "num_tokens": 35974317.0, + "reward": 1.165178656578064, + "reward_std": 0.2279203236103058, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, + "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1651.0, + "completions/max_terminated_length": 1651.0, + "completions/mean_length": 372.0469055175781, + "completions/mean_terminated_length": 372.0469055175781, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.34244455905105725, + "grad_norm": 0.27285560965538025, + "kl": 0.0119781494140625, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 36432978.0, + "reward": 1.1919643878936768, + "reward_std": 0.24956144392490387, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, + "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1329.0, + "completions/max_terminated_length": 1329.0, + "completions/mean_length": 395.25225830078125, + "completions/mean_terminated_length": 395.25225830078125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.34657039711191334, + "grad_norm": 0.220729798078537, + "kl": 0.01096343994140625, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 36892984.0, + "reward": 1.1205358505249023, + "reward_std": 0.18371452391147614, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3794.0, + "completions/max_terminated_length": 3794.0, + "completions/mean_length": 372.86163330078125, + "completions/mean_terminated_length": 372.86163330078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.3506962351727695, + "grad_norm": 0.2561819851398468, + "kl": 0.0115966796875, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 37340754.0, + "reward": 1.0915179252624512, + "reward_std": 0.2180769443511963, + "rewards/code_format_reward/mean": 0.984375, + "rewards/code_format_reward/std": 0.12415824085474014, + "rewards/curriculum_aware_reward_fn/mean": 0.1071428582072258, + "rewards/curriculum_aware_reward_fn/std": 0.3096405565738678, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 340.00225830078125, + "completions/mean_terminated_length": 331.59954833984375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.3548220732336256, + "grad_norm": 0.25818702578544617, + "kl": 0.012969970703125, + "learning_rate": 1e-06, + "loss": 0.044, + "num_tokens": 37750619.0, + "reward": 1.15625, + "reward_std": 0.2230120152235031, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, + "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 349.18975830078125, + "completions/mean_terminated_length": 349.18975830078125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.35894791129448167, + "grad_norm": 0.21370843052864075, + "kl": 0.0142059326171875, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 38191410.0, + "reward": 1.140625, + "reward_std": 0.16670270264148712, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, + "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1830.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 355.7187805175781, + "completions/mean_terminated_length": 355.7187805175781, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.3630737493553378, + "grad_norm": 0.243008553981781, + "kl": 0.0123291015625, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 38626488.0, + "reward": 1.1383929252624512, + "reward_std": 0.19419102370738983, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, + "rewards/curriculum_aware_reward_fn/std": 0.352584570646286, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1263.0, + "completions/max_terminated_length": 1263.0, + "completions/mean_length": 361.5245666503906, + "completions/mean_terminated_length": 361.5245666503906, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.3671995874161939, + "grad_norm": 0.23066678643226624, + "kl": 0.0127716064453125, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 39070327.0, + "reward": 1.1361607313156128, + "reward_std": 0.19861890375614166, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, + "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1316.0, + "completions/max_terminated_length": 1316.0, + "completions/mean_length": 344.4508972167969, + "completions/mean_terminated_length": 344.4508972167969, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.37132542547705005, + "grad_norm": 0.22873297333717346, + "kl": 0.0172271728515625, + "learning_rate": 1e-06, + "loss": 0.0152, + "num_tokens": 39504172.0, + "reward": 1.118303656578064, + "reward_std": 0.18025504052639008, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1227678582072258, + "rewards/curriculum_aware_reward_fn/std": 0.3285374343395233, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1097.0, + "completions/max_terminated_length": 1097.0, + "completions/mean_length": 337.7276916503906, + "completions/mean_terminated_length": 337.7276916503906, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.37545126353790614, + "grad_norm": 0.2452411651611328, + "kl": 0.01430511474609375, + "learning_rate": 1e-06, + "loss": 0.013, + "num_tokens": 39933654.0, + "reward": 1.1316964626312256, + "reward_std": 0.1899057924747467, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968, + "rewards/curriculum_aware_reward_fn/std": 0.34095630049705505, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1061.0, + "completions/max_terminated_length": 1061.0, + "completions/mean_length": 333.578125, + "completions/mean_terminated_length": 333.578125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.37957710159876223, + "grad_norm": 0.2378436028957367, + "kl": 0.01314544677734375, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 40341761.0, + "reward": 1.15625, + "reward_std": 0.2169431895017624, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.37560540437698364, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 324.9598388671875, + "completions/mean_terminated_length": 324.9598388671875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.3837029396596184, + "grad_norm": 0.2310194969177246, + "kl": 0.01369476318359375, + "learning_rate": 1e-06, + "loss": 0.0213, + "num_tokens": 40760812.0, + "reward": 1.149553656578064, + "reward_std": 0.198617085814476, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.3873344361782074, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1514.0, + "completions/max_terminated_length": 1514.0, + "completions/mean_length": 342.8951110839844, + "completions/mean_terminated_length": 342.8951110839844, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.38782877772047447, + "grad_norm": 0.2386288344860077, + "kl": 0.013671875, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 41179939.0, + "reward": 1.15625, + "reward_std": 0.2045021653175354, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, + "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 960.0, + "completions/max_terminated_length": 960.0, + "completions/mean_length": 331.078125, + "completions/mean_terminated_length": 331.078125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.39195461578133056, + "grad_norm": 0.25872641801834106, + "kl": 0.01519775390625, + "learning_rate": 1e-06, + "loss": 0.0124, + "num_tokens": 41603992.0, + "reward": 1.1763393878936768, + "reward_std": 0.20531079173088074, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3834211826324463, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3766.0, + "completions/max_terminated_length": 3766.0, + "completions/mean_length": 349.6160888671875, + "completions/mean_terminated_length": 349.6160888671875, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.3960804538421867, + "grad_norm": 0.22162525355815887, + "kl": 0.01556396484375, + "learning_rate": 1e-06, + "loss": 0.0156, + "num_tokens": 42031849.0, + "reward": 1.180803656578064, + "reward_std": 0.1656801700592041, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, + "rewards/curriculum_aware_reward_fn/std": 0.39252743124961853, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 337.1763610839844, + "completions/mean_terminated_length": 337.1763610839844, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.4002062919030428, + "grad_norm": 0.26754871010780334, + "kl": 0.01500701904296875, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 42448883.0, + "reward": 1.2321429252624512, + "reward_std": 0.2509976029396057, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.2388392835855484, + "rewards/curriculum_aware_reward_fn/std": 0.4522976279258728, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1271.0, + "completions/max_terminated_length": 1271.0, + "completions/mean_length": 341.90850830078125, + "completions/mean_terminated_length": 341.90850830078125, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.4043321299638989, + "grad_norm": 0.26659685373306274, + "kl": 0.0152587890625, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 42877732.0, + "reward": 1.2008929252624512, + "reward_std": 0.23461629450321198, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.2098214328289032, + "rewards/curriculum_aware_reward_fn/std": 0.42378073930740356, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 319.0558166503906, + "completions/mean_terminated_length": 319.0558166503906, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.40845796802475504, + "grad_norm": 0.24595794081687927, + "kl": 0.0162200927734375, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 43290883.0, + "reward": 1.1875, + "reward_std": 0.20630162954330444, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1875, + "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 328.95538330078125, + "completions/mean_terminated_length": 320.5279541015625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.4125838060856111, + "grad_norm": 0.21619565784931183, + "kl": 0.015472412109375, + "learning_rate": 1e-06, + "loss": 0.027, + "num_tokens": 43702890.0, + "reward": 1.15625, + "reward_std": 0.15517687797546387, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3676777780056, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 339.11163330078125, + "completions/mean_terminated_length": 339.11163330078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.4167096441464673, + "grad_norm": 0.21944324672222137, + "kl": 0.014801025390625, + "learning_rate": 1e-06, + "loss": 0.0124, + "num_tokens": 44137822.0, + "reward": 1.1361607313156128, + "reward_std": 0.195481076836586, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1361607164144516, + "rewards/curriculum_aware_reward_fn/std": 0.35613569617271423, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1717.0, + "completions/max_terminated_length": 1717.0, + "completions/mean_length": 325.9352722167969, + "completions/mean_terminated_length": 325.9352722167969, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.42083548220732336, + "grad_norm": 0.24191616475582123, + "kl": 0.0155029296875, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 44569189.0, + "reward": 1.1875, + "reward_std": 0.1999538093805313, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1875, + "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2173.0, + "completions/max_terminated_length": 2173.0, + "completions/mean_length": 332.6317138671875, + "completions/mean_terminated_length": 332.6317138671875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.42496132026817945, + "grad_norm": 0.24171848595142365, + "kl": 0.0145721435546875, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 44998642.0, + "reward": 1.1294643878936768, + "reward_std": 0.2073148936033249, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968, + "rewards/curriculum_aware_reward_fn/std": 0.34095630049705505, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1289.0, + "completions/max_terminated_length": 1289.0, + "completions/mean_length": 328.2901916503906, + "completions/mean_terminated_length": 328.2901916503906, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.4290871583290356, + "grad_norm": 0.2536793649196625, + "kl": 0.01633453369140625, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 45401699.0, + "reward": 1.102678656578064, + "reward_std": 0.19625361263751984, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1049107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3067808747291565, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 293.4933166503906, + "completions/mean_terminated_length": 293.4933166503906, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.4332129963898917, + "grad_norm": 0.26053330302238464, + "kl": 0.017364501953125, + "learning_rate": 1e-06, + "loss": 0.0221, + "num_tokens": 45793262.0, + "reward": 1.227678656578064, + "reward_std": 0.2376893311738968, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.2299107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.42124560475349426, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 305.71429443359375, + "completions/mean_terminated_length": 305.71429443359375, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.4373388344507478, + "grad_norm": 0.22793447971343994, + "kl": 0.0171966552734375, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 46204258.0, + "reward": 1.1383929252624512, + "reward_std": 0.16399335861206055, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1383928507566452, + "rewards/curriculum_aware_reward_fn/std": 0.34569787979125977, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 977.0, + "completions/max_terminated_length": 977.0, + "completions/mean_length": 309.25225830078125, + "completions/mean_terminated_length": 309.25225830078125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.44146467251160393, + "grad_norm": 0.6037999391555786, + "kl": 0.0637969970703125, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 46595608.0, + "reward": 1.212053656578064, + "reward_std": 0.2621622085571289, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452, + "rewards/curriculum_aware_reward_fn/std": 0.41233164072036743, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 295.12725830078125, + "completions/mean_terminated_length": 295.12725830078125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.44559051057246, + "grad_norm": 0.2791634202003479, + "kl": 0.01679229736328125, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 46995953.0, + "reward": 1.1897321939468384, + "reward_std": 0.25247541069984436, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1053.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 300.75225830078125, + "completions/mean_terminated_length": 300.75225830078125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.44971634863331617, + "grad_norm": 0.259223073720932, + "kl": 0.01723480224609375, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 47398069.0, + "reward": 1.1986607313156128, + "reward_std": 0.18366967141628265, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1986607164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3994380831718445, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 306.62725830078125, + "completions/mean_terminated_length": 306.62725830078125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.45384218669417226, + "grad_norm": 0.26962050795555115, + "kl": 0.01969146728515625, + "learning_rate": 1e-06, + "loss": -0.0116, + "num_tokens": 47807458.0, + "reward": 1.1540179252624512, + "reward_std": 0.19651676714420319, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, + "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 783.0, + "completions/max_terminated_length": 783.0, + "completions/mean_length": 281.5201110839844, + "completions/mean_terminated_length": 281.5201110839844, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.45796802475502835, + "grad_norm": 0.288915753364563, + "kl": 0.0187835693359375, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 48179027.0, + "reward": 1.2678571939468384, + "reward_std": 0.230044886469841, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.2723214328289032, + "rewards/curriculum_aware_reward_fn/std": 0.4456520676612854, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1853.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 309.90850830078125, + "completions/mean_terminated_length": 309.90850830078125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.4620938628158845, + "grad_norm": 0.3339619040489197, + "kl": 0.03505706787109375, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 48599777.0, + "reward": 1.1428571939468384, + "reward_std": 0.17816074192523956, + "rewards/code_format_reward/mean": 0.9888392686843872, + "rewards/code_format_reward/std": 0.10517053306102753, + "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, + "rewards/curriculum_aware_reward_fn/std": 0.36136940121650696, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 984.0, + "completions/max_terminated_length": 984.0, + "completions/mean_length": 302.5915222167969, + "completions/mean_terminated_length": 302.5915222167969, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.4662197008767406, + "grad_norm": 0.2208058387041092, + "kl": 0.0186004638671875, + "learning_rate": 1e-06, + "loss": 0.0224, + "num_tokens": 49020525.0, + "reward": 1.1540179252624512, + "reward_std": 0.16597501933574677, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, + "rewards/curriculum_aware_reward_fn/std": 0.36136937141418457, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 291.4821472167969, + "completions/mean_terminated_length": 291.4821472167969, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.4703455389375967, + "grad_norm": 0.3014701306819916, + "kl": 0.021331787109375, + "learning_rate": 1e-06, + "loss": 0.0208, + "num_tokens": 49403298.0, + "reward": 1.149553656578064, + "reward_std": 0.2552274167537689, + "rewards/code_format_reward/mean": 0.9888392686843872, + "rewards/code_format_reward/std": 0.10517053306102753, + "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3676777780056, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1006.0, + "completions/max_terminated_length": 1006.0, + "completions/mean_length": 295.80804443359375, + "completions/mean_terminated_length": 295.80804443359375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.4744713769984528, + "grad_norm": 0.28405463695526123, + "kl": 0.020538330078125, + "learning_rate": 1e-06, + "loss": 0.0215, + "num_tokens": 49797277.0, + "reward": 1.1540180444717407, + "reward_std": 0.25069838762283325, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3737127482891083, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 320.63616943359375, + "completions/mean_terminated_length": 312.1901550292969, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.4785972150593089, + "grad_norm": 0.24680057168006897, + "kl": 0.016876220703125, + "learning_rate": 1e-06, + "loss": 0.0261, + "num_tokens": 50206376.0, + "reward": 1.1674107313156128, + "reward_std": 0.22103038430213928, + "rewards/code_format_reward/mean": 0.984375, + "rewards/code_format_reward/std": 0.12415824085474014, + "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1170.0, + "completions/max_terminated_length": 1170.0, + "completions/mean_length": 312.59375, + "completions/mean_terminated_length": 312.59375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.48272305312016506, + "grad_norm": 0.278852641582489, + "kl": 0.01776885986328125, + "learning_rate": 1e-06, + "loss": 0.0177, + "num_tokens": 50625368.0, + "reward": 1.15625, + "reward_std": 0.21217124164104462, + "rewards/code_format_reward/mean": 0.9888392686843872, + "rewards/code_format_reward/std": 0.10517053306102753, + "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 977.0, + "completions/max_terminated_length": 977.0, + "completions/mean_length": 288.9598388671875, + "completions/mean_terminated_length": 288.9598388671875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.48684889118102115, + "grad_norm": 0.29333117604255676, + "kl": 0.01859283447265625, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 51026044.0, + "reward": 1.1517857313156128, + "reward_std": 0.23571307957172394, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3676777780056, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1755.0, + "completions/max_terminated_length": 1755.0, + "completions/mean_length": 304.8258972167969, + "completions/mean_terminated_length": 304.8258972167969, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.49097472924187724, + "grad_norm": 0.23791854083538055, + "kl": 0.017547607421875, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 51435060.0, + "reward": 1.149553656578064, + "reward_std": 0.1614886224269867, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, + "rewards/curriculum_aware_reward_fn/std": 0.40789952874183655, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1238.0, + "completions/max_terminated_length": 1238.0, + "completions/mean_length": 309.35491943359375, + "completions/mean_terminated_length": 309.35491943359375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.4951005673027334, + "grad_norm": 0.32031339406967163, + "kl": 0.02161407470703125, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 51836242.0, + "reward": 1.180803656578064, + "reward_std": 0.3114909827709198, + "rewards/code_format_reward/mean": 0.9776785969734192, + "rewards/code_format_reward/std": 0.1478918492794037, + "rewards/curriculum_aware_reward_fn/mean": 0.203125, + "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.004464285714285698, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 891.0, + "completions/mean_length": 310.6138610839844, + "completions/mean_terminated_length": 293.6390380859375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.4992264053635895, + "grad_norm": 0.26912397146224976, + "kl": 0.017120361328125, + "learning_rate": 1e-06, + "loss": 0.0528, + "num_tokens": 52242545.0, + "reward": 1.1361608505249023, + "reward_std": 0.26077795028686523, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1616371124982834, + "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1507.0, + "completions/max_terminated_length": 1507.0, + "completions/mean_length": 304.5602722167969, + "completions/mean_terminated_length": 304.5602722167969, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.5033522434244456, + "grad_norm": 0.28717559576034546, + "kl": 0.016204833984375, + "learning_rate": 1e-06, + "loss": 0.0229, + "num_tokens": 52644883.0, + "reward": 1.2455357313156128, + "reward_std": 0.29382893443107605, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.25, + "rewards/curriculum_aware_reward_fn/std": 0.4823506772518158, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1113.0, + "completions/max_terminated_length": 1113.0, + "completions/mean_length": 324.8370666503906, + "completions/mean_terminated_length": 324.8370666503906, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.5074780814853017, + "grad_norm": 0.28405648469924927, + "kl": 0.01723480224609375, + "learning_rate": 1e-06, + "loss": 0.0218, + "num_tokens": 53072097.0, + "reward": 1.1540179252624512, + "reward_std": 0.2212349772453308, + "rewards/code_format_reward/mean": 0.9866071343421936, + "rewards/code_format_reward/std": 0.11507844179868698, + "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 294.9910888671875, + "completions/mean_terminated_length": 294.9910888671875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.5116039195461578, + "grad_norm": 0.25819990038871765, + "kl": 0.0186614990234375, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 53460320.0, + "reward": 1.149553656578064, + "reward_std": 0.20559635758399963, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1141.0, + "completions/max_terminated_length": 1141.0, + "completions/mean_length": 307.7723388671875, + "completions/mean_terminated_length": 307.7723388671875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.5157297576070139, + "grad_norm": 0.29429891705513, + "kl": 0.0176544189453125, + "learning_rate": 1e-06, + "loss": 0.0331, + "num_tokens": 53860688.0, + "reward": 1.196428656578064, + "reward_std": 0.22343392670154572, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.203125, + "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 293.2321472167969, + "completions/mean_terminated_length": 293.2321472167969, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.51985559566787, + "grad_norm": 0.30082517862319946, + "kl": 0.0195159912109375, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 54251469.0, + "reward": 1.212053656578064, + "reward_std": 0.21257728338241577, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.2209821492433548, + "rewards/curriculum_aware_reward_fn/std": 0.4153723120689392, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2435.0, + "completions/max_terminated_length": 2435.0, + "completions/mean_length": 318.7946472167969, + "completions/mean_terminated_length": 318.7946472167969, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.5239814337287262, + "grad_norm": 0.27134010195732117, + "kl": 0.01700592041015625, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 54667695.0, + "reward": 1.1383929252624512, + "reward_std": 0.2325977087020874, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, + "rewards/curriculum_aware_reward_fn/std": 0.3566471338272095, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1320.0, + "completions/max_terminated_length": 1320.0, + "completions/mean_length": 321.45538330078125, + "completions/mean_terminated_length": 321.45538330078125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.5281072717895823, + "grad_norm": 0.23834328353405, + "kl": 0.01728057861328125, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 55078358.0, + "reward": 1.1607143878936768, + "reward_std": 0.15348079800605774, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1619.0, + "completions/max_terminated_length": 1619.0, + "completions/mean_length": 337.97991943359375, + "completions/mean_terminated_length": 337.97991943359375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.5322331098504384, + "grad_norm": 0.24708496034145355, + "kl": 0.0186614990234375, + "learning_rate": 1e-06, + "loss": 0.0155, + "num_tokens": 55527742.0, + "reward": 1.127232313156128, + "reward_std": 0.2065713256597519, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.004464285714285698, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1139.0, + "completions/mean_length": 329.22991943359375, + "completions/mean_terminated_length": 312.3385925292969, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.5363589479112945, + "grad_norm": 0.2480604648590088, + "kl": 0.020965576171875, + "learning_rate": 1e-06, + "loss": 0.0453, + "num_tokens": 55941881.0, + "reward": 1.1696429252624512, + "reward_std": 0.2080267071723938, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.41156184673309326, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1275.0, + "completions/max_terminated_length": 1275.0, + "completions/mean_length": 324.04241943359375, + "completions/mean_terminated_length": 324.04241943359375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.5404847859721505, + "grad_norm": 0.27031761407852173, + "kl": 0.0172119140625, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 56365057.0, + "reward": 1.1607143878936768, + "reward_std": 0.20167233049869537, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3676777780056, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1266.0, + "completions/max_terminated_length": 1266.0, + "completions/mean_length": 332.5133972167969, + "completions/mean_terminated_length": 332.5133972167969, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.5446106240330068, + "grad_norm": 0.2554526627063751, + "kl": 0.0162353515625, + "learning_rate": 1e-06, + "loss": 0.0179, + "num_tokens": 56796550.0, + "reward": 1.1629464626312256, + "reward_std": 0.19279412925243378, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 998.0, + "completions/max_terminated_length": 998.0, + "completions/mean_length": 320.7076110839844, + "completions/mean_terminated_length": 320.7076110839844, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.5487364620938628, + "grad_norm": 0.27639445662498474, + "kl": 0.0175933837890625, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 57218694.0, + "reward": 1.140625, + "reward_std": 0.2198539674282074, + "rewards/code_format_reward/mean": 0.9888392686843872, + "rewards/code_format_reward/std": 0.10517053306102753, + "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1035.0, + "completions/max_terminated_length": 1035.0, + "completions/mean_length": 330.8214416503906, + "completions/mean_terminated_length": 330.8214416503906, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.5528623001547189, + "grad_norm": 0.27146339416503906, + "kl": 0.01773834228515625, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 57637410.0, + "reward": 1.1383929252624512, + "reward_std": 0.19378496706485748, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.140625, + "rewards/curriculum_aware_reward_fn/std": 0.3480229377746582, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 362.9352722167969, + "completions/mean_terminated_length": 354.5838928222656, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.556988138215575, + "grad_norm": 0.2487378716468811, + "kl": 0.017333984375, + "learning_rate": 1e-06, + "loss": 0.0321, + "num_tokens": 58083673.0, + "reward": 1.1696429252624512, + "reward_std": 0.20776358246803284, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3873537480831146, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1828.0, + "completions/max_terminated_length": 1828.0, + "completions/mean_length": 327.7410888671875, + "completions/mean_terminated_length": 327.7410888671875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.5611139762764311, + "grad_norm": 0.27792710065841675, + "kl": 0.019989013671875, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 58492336.0, + "reward": 1.1919643878936768, + "reward_std": 0.24865475296974182, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, + "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1106.0, + "completions/max_terminated_length": 1106.0, + "completions/mean_length": 347.3214416503906, + "completions/mean_terminated_length": 347.3214416503906, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.5652398143372873, + "grad_norm": 0.22186486423015594, + "kl": 0.018646240234375, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 58913446.0, + "reward": 1.1540179252624512, + "reward_std": 0.1549137532711029, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.36349809169769287, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1071.0, + "completions/max_terminated_length": 1071.0, + "completions/mean_length": 346.89288330078125, + "completions/mean_terminated_length": 346.89288330078125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.5693656523981434, + "grad_norm": 0.2620648443698883, + "kl": 0.0197601318359375, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 59344787.0, + "reward": 1.149553656578064, + "reward_std": 0.19237443804740906, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968, + "rewards/curriculum_aware_reward_fn/std": 0.36324387788772583, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 325.52679443359375, + "completions/mean_terminated_length": 325.52679443359375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.5734914904589995, + "grad_norm": 0.25917235016822815, + "kl": 0.0196533203125, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 59741384.0, + "reward": 1.171875238418579, + "reward_std": 0.23336145281791687, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3815346360206604, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3192.0, + "completions/max_terminated_length": 3192.0, + "completions/mean_length": 350.24554443359375, + "completions/mean_terminated_length": 350.24554443359375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.5776173285198556, + "grad_norm": 0.2704315781593323, + "kl": 0.01824951171875, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 60173646.0, + "reward": 1.165178656578064, + "reward_std": 0.20555807650089264, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 348.03350830078125, + "completions/mean_terminated_length": 348.03350830078125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.5817431665807117, + "grad_norm": 0.24649009108543396, + "kl": 0.01763916015625, + "learning_rate": 1e-06, + "loss": 0.0198, + "num_tokens": 60600283.0, + "reward": 1.1450893878936768, + "reward_std": 0.20083805918693542, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, + "rewards/curriculum_aware_reward_fn/std": 0.352584570646286, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1494.0, + "completions/max_terminated_length": 1494.0, + "completions/mean_length": 369.7388610839844, + "completions/mean_terminated_length": 369.7388610839844, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.5858690046415678, + "grad_norm": 0.2548430562019348, + "kl": 0.01636505126953125, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 61055684.0, + "reward": 1.1696429252624512, + "reward_std": 0.2090558409690857, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, + "rewards/curriculum_aware_reward_fn/std": 0.3816458284854889, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 992.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 353.56475830078125, + "completions/mean_terminated_length": 353.56475830078125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.589994842702424, + "grad_norm": 0.2363528460264206, + "kl": 0.01692962646484375, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 61495544.0, + "reward": 1.149553656578064, + "reward_std": 0.20312771201133728, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968, + "rewards/curriculum_aware_reward_fn/std": 0.35703200101852417, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 924.0, + "completions/max_terminated_length": 924.0, + "completions/mean_length": 346.1004638671875, + "completions/mean_terminated_length": 346.1004638671875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.5941206807632801, + "grad_norm": 0.25129303336143494, + "kl": 0.01854705810546875, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 61909660.0, + "reward": 1.1584821939468384, + "reward_std": 0.20729246735572815, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, + "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1139.0, + "completions/max_terminated_length": 1139.0, + "completions/mean_length": 369.67413330078125, + "completions/mean_terminated_length": 369.67413330078125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.5982465188241362, + "grad_norm": 0.2561042606830597, + "kl": 0.01654052734375, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 62344613.0, + "reward": 1.140625, + "reward_std": 0.22326858341693878, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, + "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2887.0, + "completions/mean_length": 360.9687805175781, + "completions/mean_terminated_length": 352.61297607421875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.6023723568849922, + "grad_norm": 0.25807270407676697, + "kl": 0.0168609619140625, + "learning_rate": 1e-06, + "loss": 0.0234, + "num_tokens": 62762009.0, + "reward": 1.2142857313156128, + "reward_std": 0.2551623582839966, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452, + "rewards/curriculum_aware_reward_fn/std": 0.4334910213947296, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1202.0, + "completions/mean_length": 378.3750305175781, + "completions/mean_terminated_length": 370.05816650390625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.6064981949458483, + "grad_norm": 0.3243965208530426, + "kl": 0.0496826171875, + "learning_rate": 1e-06, + "loss": 0.0411, + "num_tokens": 63197325.0, + "reward": 1.1540179252624512, + "reward_std": 0.21813544631004333, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, + "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1059.0, + "completions/max_terminated_length": 1059.0, + "completions/mean_length": 354.47991943359375, + "completions/mean_terminated_length": 354.47991943359375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.6106240330067045, + "grad_norm": 0.27332666516304016, + "kl": 0.0201263427734375, + "learning_rate": 1e-06, + "loss": 0.0223, + "num_tokens": 63627732.0, + "reward": 1.140625, + "reward_std": 0.24266810715198517, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3525845408439636, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1062.0, + "completions/max_terminated_length": 1062.0, + "completions/mean_length": 355.8035888671875, + "completions/mean_terminated_length": 355.8035888671875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.6147498710675606, + "grad_norm": 0.24161125719547272, + "kl": 0.016448974609375, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 64075567.0, + "reward": 1.1473214626312256, + "reward_std": 0.21147961914539337, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 390.1383972167969, + "completions/mean_terminated_length": 390.1383972167969, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.6188757091284167, + "grad_norm": 0.19339683651924133, + "kl": 0.01575469970703125, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 64536040.0, + "reward": 1.118303656578064, + "reward_std": 0.16575203835964203, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1122.0, + "completions/max_terminated_length": 1122.0, + "completions/mean_length": 346.2812805175781, + "completions/mean_terminated_length": 346.2812805175781, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.6230015471892728, + "grad_norm": 0.2566029727458954, + "kl": 0.0163116455078125, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 64954262.0, + "reward": 1.1629464626312256, + "reward_std": 0.23038895428180695, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 344.9687805175781, + "completions/mean_terminated_length": 344.9687805175781, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.6271273852501289, + "grad_norm": 0.2446659654378891, + "kl": 0.01611328125, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 65365750.0, + "reward": 1.1473214626312256, + "reward_std": 0.17018462717533112, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1171.0, + "completions/max_terminated_length": 1171.0, + "completions/mean_length": 348.5245666503906, + "completions/mean_terminated_length": 348.5245666503906, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.631253223310985, + "grad_norm": 0.28293153643608093, + "kl": 0.03037261962890625, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 65796226.0, + "reward": 1.1875, + "reward_std": 0.25763100385665894, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3942854404449463, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 370.1071472167969, + "completions/mean_terminated_length": 370.1071472167969, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.6353790613718412, + "grad_norm": 0.24039463698863983, + "kl": 0.015869140625, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 66231975.0, + "reward": 1.1517857313156128, + "reward_std": 0.21372443437576294, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, + "rewards/curriculum_aware_reward_fn/std": 0.36136937141418457, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 394.8258972167969, + "completions/mean_terminated_length": 394.8258972167969, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.6395048994326973, + "grad_norm": 0.1961672157049179, + "kl": 0.01488494873046875, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 66691440.0, + "reward": 1.140625, + "reward_std": 0.12813948094844818, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548, + "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1462.0, + "completions/max_terminated_length": 1462.0, + "completions/mean_length": 352.5848388671875, + "completions/mean_terminated_length": 352.5848388671875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.6436307374935534, + "grad_norm": 0.25957784056663513, + "kl": 0.0162506103515625, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 67124834.0, + "reward": 1.203125, + "reward_std": 0.219389408826828, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.2053571492433548, + "rewards/curriculum_aware_reward_fn/std": 0.40441396832466125, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1036.0, + "completions/max_terminated_length": 1036.0, + "completions/mean_length": 358.00225830078125, + "completions/mean_terminated_length": 358.00225830078125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.6477565755544095, + "grad_norm": 0.24108652770519257, + "kl": 0.01555633544921875, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 67562832.0, + "reward": 1.1584821939468384, + "reward_std": 0.2208447903394699, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, + "rewards/curriculum_aware_reward_fn/std": 0.3716694116592407, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1446.0, + "completions/max_terminated_length": 1446.0, + "completions/mean_length": 366.8883972167969, + "completions/mean_terminated_length": 366.8883972167969, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.6518824136152656, + "grad_norm": 0.23085784912109375, + "kl": 0.01419830322265625, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 67971421.0, + "reward": 1.1473214626312256, + "reward_std": 0.17780883610248566, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1530.0, + "completions/max_terminated_length": 1530.0, + "completions/mean_length": 374.72991943359375, + "completions/mean_terminated_length": 374.72991943359375, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.6560082516761218, + "grad_norm": 0.2072339653968811, + "kl": 0.01485443115234375, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 68419713.0, + "reward": 1.1607143878936768, + "reward_std": 0.156369149684906, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1529.0, + "completions/max_terminated_length": 1529.0, + "completions/mean_length": 391.3415222167969, + "completions/mean_terminated_length": 391.3415222167969, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.6601340897369778, + "grad_norm": 0.22335219383239746, + "kl": 0.018585205078125, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 68894900.0, + "reward": 1.1629464626312256, + "reward_std": 0.20225736498832703, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1236.0, + "completions/max_terminated_length": 1236.0, + "completions/mean_length": 385.0826110839844, + "completions/mean_terminated_length": 385.0826110839844, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.6642599277978339, + "grad_norm": 0.22281871736049652, + "kl": 0.01507568359375, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 69340053.0, + "reward": 1.1674107313156128, + "reward_std": 0.19550350308418274, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, + "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 944.0, + "completions/max_terminated_length": 944.0, + "completions/mean_length": 363.0000305175781, + "completions/mean_terminated_length": 363.0000305175781, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.66838576585869, + "grad_norm": 0.23224228620529175, + "kl": 0.0154876708984375, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 69771568.0, + "reward": 1.1361607313156128, + "reward_std": 0.20033742487430573, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1383928507566452, + "rewards/curriculum_aware_reward_fn/std": 0.34569787979125977, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1050.0, + "completions/max_terminated_length": 1050.0, + "completions/mean_length": 361.09600830078125, + "completions/mean_terminated_length": 361.09600830078125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.6725116039195461, + "grad_norm": 0.24737422168254852, + "kl": 0.01534271240234375, + "learning_rate": 1e-06, + "loss": -0.0084, + "num_tokens": 70200179.0, + "reward": 1.1629464626312256, + "reward_std": 0.22474639117717743, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968, + "rewards/curriculum_aware_reward_fn/std": 0.37175676226615906, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1104.0, + "completions/max_terminated_length": 1104.0, + "completions/mean_length": 351.03350830078125, + "completions/mean_terminated_length": 351.03350830078125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.6766374419804023, + "grad_norm": 0.2552102208137512, + "kl": 0.01573944091796875, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 70628041.0, + "reward": 1.1450893878936768, + "reward_std": 0.1989629715681076, + "rewards/code_format_reward/mean": 0.9888392686843872, + "rewards/code_format_reward/std": 0.10517053306102753, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.36349809169769287, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 981.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 365.0535888671875, + "completions/mean_terminated_length": 365.0535888671875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.6807632800412584, + "grad_norm": 0.2395404428243637, + "kl": 0.0143280029296875, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 71071298.0, + "reward": 1.1875, + "reward_std": 0.22171072661876678, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 362.4844055175781, + "completions/mean_terminated_length": 362.4844055175781, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.6848891181021145, + "grad_norm": 0.21977753937244415, + "kl": 0.01514434814453125, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 71518906.0, + "reward": 1.149553656578064, + "reward_std": 0.1930348426103592, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452, + "rewards/curriculum_aware_reward_fn/std": 0.36136940121650696, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 342.22991943359375, + "completions/mean_terminated_length": 342.22991943359375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.6890149561629706, + "grad_norm": 0.2827470004558563, + "kl": 0.015411376953125, + "learning_rate": 1e-06, + "loss": 0.0304, + "num_tokens": 71938603.0, + "reward": 1.1941964626312256, + "reward_std": 0.26317548751831055, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, + "rewards/curriculum_aware_reward_fn/std": 0.4088349938392639, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1074.0, + "completions/max_terminated_length": 1074.0, + "completions/mean_length": 384.7901916503906, + "completions/mean_terminated_length": 384.7901916503906, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.6931407942238267, + "grad_norm": 0.23986679315567017, + "kl": 0.015289306640625, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 72401657.0, + "reward": 1.0982143878936768, + "reward_std": 0.18790170550346375, + "rewards/code_format_reward/mean": 0.9888392686843872, + "rewards/code_format_reward/std": 0.10517053306102753, + "rewards/curriculum_aware_reward_fn/mean": 0.109375, + "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1182.0, + "completions/max_terminated_length": 1182.0, + "completions/mean_length": 366.13616943359375, + "completions/mean_terminated_length": 366.13616943359375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.6972666322846828, + "grad_norm": 0.2206612080335617, + "kl": 0.013885498046875, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 72851139.0, + "reward": 1.171875, + "reward_std": 0.1650066077709198, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1107.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 361.0625305175781, + "completions/mean_terminated_length": 361.0625305175781, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.701392470345539, + "grad_norm": 0.19477325677871704, + "kl": 0.01851654052734375, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 73305089.0, + "reward": 1.15625, + "reward_std": 0.1615247130393982, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548, + "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1212.0, + "completions/max_terminated_length": 1212.0, + "completions/mean_length": 367.2544860839844, + "completions/mean_terminated_length": 367.2544860839844, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.7055183084063951, + "grad_norm": 0.25459757447242737, + "kl": 0.0151519775390625, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 73743438.0, + "reward": 1.149553656578064, + "reward_std": 0.19232958555221558, + "rewards/code_format_reward/mean": 0.9888392686843872, + "rewards/code_format_reward/std": 0.10517053306102753, + "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3676777780056, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 943.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 346.82366943359375, + "completions/mean_terminated_length": 346.82366943359375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.7096441464672512, + "grad_norm": 0.26766178011894226, + "kl": 0.0182037353515625, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 74159257.0, + "reward": 1.1629464626312256, + "reward_std": 0.24092715978622437, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.38935965299606323, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 843.0, + "completions/max_terminated_length": 843.0, + "completions/mean_length": 340.3258972167969, + "completions/mean_terminated_length": 340.3258972167969, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.7137699845281072, + "grad_norm": 0.25564658641815186, + "kl": 0.0155181884765625, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 74568644.0, + "reward": 1.1741071939468384, + "reward_std": 0.2240411341190338, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3834212124347687, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1173.0, + "completions/max_terminated_length": 1173.0, + "completions/mean_length": 322.3035888671875, + "completions/mean_terminated_length": 322.3035888671875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.7178958225889633, + "grad_norm": 0.2987499535083771, + "kl": 0.02458953857421875, + "learning_rate": 1e-06, + "loss": 0.02, + "num_tokens": 74979603.0, + "reward": 1.2053571939468384, + "reward_std": 0.27893462777137756, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.2120535671710968, + "rewards/curriculum_aware_reward_fn/std": 0.4146503508090973, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1080.0, + "completions/mean_length": 333.75, + "completions/mean_terminated_length": 325.3333435058594, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.7220216606498195, + "grad_norm": 0.2896713316440582, + "kl": 0.0155487060546875, + "learning_rate": 1e-06, + "loss": 0.0192, + "num_tokens": 75427687.0, + "reward": 1.1741071939468384, + "reward_std": 0.23012901842594147, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.1808035671710968, + "rewards/curriculum_aware_reward_fn/std": 0.3852855861186981, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 333.93304443359375, + "completions/mean_terminated_length": 333.93304443359375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.7261474987106756, + "grad_norm": 0.26385900378227234, + "kl": 0.01572418212890625, + "learning_rate": 1e-06, + "loss": 0.0158, + "num_tokens": 75846166.0, + "reward": 1.180803656578064, + "reward_std": 0.24136929214000702, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1144.0, + "completions/max_terminated_length": 1144.0, + "completions/mean_length": 338.9598388671875, + "completions/mean_terminated_length": 338.9598388671875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.7302733367715317, + "grad_norm": 0.2855651080608368, + "kl": 0.01647186279296875, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 76266400.0, + "reward": 1.1629464626312256, + "reward_std": 0.24424399435520172, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349845170975, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.40068626403808594, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1064.0, + "completions/max_terminated_length": 1064.0, + "completions/mean_length": 311.58929443359375, + "completions/mean_terminated_length": 311.58929443359375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.7343991748323878, + "grad_norm": 0.26801854372024536, + "kl": 0.02155303955078125, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 76686384.0, + "reward": 1.1852679252624512, + "reward_std": 0.2134612649679184, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, + "rewards/curriculum_aware_reward_fn/std": 0.39252743124961853, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 813.0, + "completions/max_terminated_length": 813.0, + "completions/mean_length": 304.47991943359375, + "completions/mean_terminated_length": 304.47991943359375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.7385250128932439, + "grad_norm": 0.35143211483955383, + "kl": 0.03144073486328125, + "learning_rate": 1e-06, + "loss": 0.0237, + "num_tokens": 77071872.0, + "reward": 1.2566964626312256, + "reward_std": 0.30517578125, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.44096609950065613, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1169.0, + "completions/max_terminated_length": 1169.0, + "completions/mean_length": 341.2544860839844, + "completions/mean_terminated_length": 341.2544860839844, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.7426508509541001, + "grad_norm": 0.24900677800178528, + "kl": 0.01654052734375, + "learning_rate": 1e-06, + "loss": 0.0226, + "num_tokens": 77484440.0, + "reward": 1.1473214626312256, + "reward_std": 0.21222974359989166, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1099.0, + "completions/max_terminated_length": 1099.0, + "completions/mean_length": 332.39288330078125, + "completions/mean_terminated_length": 332.39288330078125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.7467766890149562, + "grad_norm": 0.2445136457681656, + "kl": 0.01773834228515625, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 77914906.0, + "reward": 1.165178656578064, + "reward_std": 0.2167556881904602, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1067.0, + "completions/max_terminated_length": 1067.0, + "completions/mean_length": 318.69866943359375, + "completions/mean_terminated_length": 318.69866943359375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.7509025270758123, + "grad_norm": 0.23836004734039307, + "kl": 0.023773193359375, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 78312847.0, + "reward": 1.1473214626312256, + "reward_std": 0.18241570889949799, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 767.0, + "completions/max_terminated_length": 767.0, + "completions/mean_length": 301.0245666503906, + "completions/mean_terminated_length": 301.0245666503906, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.7550283651366684, + "grad_norm": 0.2632372975349426, + "kl": 0.0267791748046875, + "learning_rate": 1e-06, + "loss": 0.013, + "num_tokens": 78707133.0, + "reward": 1.1584821939468384, + "reward_std": 0.18409815430641174, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 315.3102722167969, + "completions/mean_terminated_length": 315.3102722167969, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.7591542031975245, + "grad_norm": 0.25678277015686035, + "kl": 0.0182647705078125, + "learning_rate": 1e-06, + "loss": 0.0068, + "num_tokens": 79133075.0, + "reward": 1.1741071939468384, + "reward_std": 0.1857805848121643, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3834211826324463, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1141.0, + "completions/max_terminated_length": 1141.0, + "completions/mean_length": 314.8348388671875, + "completions/mean_terminated_length": 314.8348388671875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.7632800412583806, + "grad_norm": 0.26571905612945557, + "kl": 0.018341064453125, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 79543950.0, + "reward": 1.1383929252624512, + "reward_std": 0.21120063960552216, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484, + "rewards/curriculum_aware_reward_fn/std": 0.352584570646286, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1608.0, + "completions/max_terminated_length": 1608.0, + "completions/mean_length": 296.62725830078125, + "completions/mean_terminated_length": 296.62725830078125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.7674058793192368, + "grad_norm": 0.3173041343688965, + "kl": 0.0209197998046875, + "learning_rate": 1e-06, + "loss": 0.03, + "num_tokens": 79936861.0, + "reward": 1.2388393878936768, + "reward_std": 0.26434528827667236, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.2410714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.43853598833084106, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 306.97100830078125, + "completions/mean_terminated_length": 306.97100830078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.7715317173800929, + "grad_norm": 0.2590916156768799, + "kl": 0.0192108154296875, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 80341945.0, + "reward": 1.1227679252624512, + "reward_std": 0.2021593153476715, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 697.0, + "completions/max_terminated_length": 697.0, + "completions/mean_length": 298.37054443359375, + "completions/mean_terminated_length": 298.37054443359375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.7756575554409489, + "grad_norm": 0.26516759395599365, + "kl": 0.021209716796875, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 80738444.0, + "reward": 1.1227679252624512, + "reward_std": 0.20725637674331665, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548, + "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 807.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 292.6004638671875, + "completions/mean_terminated_length": 292.6004638671875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.779783393501805, + "grad_norm": 0.30099207162857056, + "kl": 0.019683837890625, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 81127619.0, + "reward": 1.234375, + "reward_std": 0.2714380919933319, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.2410714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.42821168899536133, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 284.65850830078125, + "completions/mean_terminated_length": 284.65850830078125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.7839092315626611, + "grad_norm": 0.2730914354324341, + "kl": 0.0259552001953125, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 81507881.0, + "reward": 1.1897321939468384, + "reward_std": 0.23237062990665436, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 295.046875, + "completions/mean_terminated_length": 295.046875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.7880350696235173, + "grad_norm": 0.2663305997848511, + "kl": 0.021148681640625, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 81889192.0, + "reward": 1.2075893878936768, + "reward_std": 0.1989629715681076, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484, + "rewards/curriculum_aware_reward_fn/std": 0.4060344398021698, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1043.0, + "completions/max_terminated_length": 1043.0, + "completions/mean_length": 313.72100830078125, + "completions/mean_terminated_length": 313.72100830078125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.7921609076843734, + "grad_norm": 0.2648645341396332, + "kl": 0.0235595703125, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 82322752.0, + "reward": 1.1540179252624512, + "reward_std": 0.20486865937709808, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.36349809169769287, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 853.0, + "completions/max_terminated_length": 853.0, + "completions/mean_length": 295.3504638671875, + "completions/mean_terminated_length": 295.3504638671875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.7962867457452295, + "grad_norm": 0.2970978021621704, + "kl": 0.0200653076171875, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 82728454.0, + "reward": 1.196428656578064, + "reward_std": 0.23176342248916626, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.203125, + "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 310.0401916503906, + "completions/mean_terminated_length": 310.0401916503906, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.8004125838060856, + "grad_norm": 0.2468232959508896, + "kl": 0.0216064453125, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 83131742.0, + "reward": 1.1875, + "reward_std": 0.20672789216041565, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3942854404449463, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 301.2633972167969, + "completions/mean_terminated_length": 301.2633972167969, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.8045384218669417, + "grad_norm": 0.28781455755233765, + "kl": 0.02532958984375, + "learning_rate": 1e-06, + "loss": -0.0154, + "num_tokens": 83525491.0, + "reward": 1.1830357313156128, + "reward_std": 0.21340614557266235, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1875, + "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 298.15179443359375, + "completions/mean_terminated_length": 298.15179443359375, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.8086642599277978, + "grad_norm": 0.25646495819091797, + "kl": 0.0185699462890625, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 83930551.0, + "reward": 1.2008929252624512, + "reward_std": 0.21293501555919647, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.203125, + "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1275.0, + "completions/max_terminated_length": 1275.0, + "completions/mean_length": 308.0401916503906, + "completions/mean_terminated_length": 308.0401916503906, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.812790097988654, + "grad_norm": 0.2715901732444763, + "kl": 0.0261383056640625, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 84335582.0, + "reward": 1.1852679252624512, + "reward_std": 0.23676243424415588, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1875, + "rewards/curriculum_aware_reward_fn/std": 0.41839686036109924, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 898.0, + "completions/max_terminated_length": 898.0, + "completions/mean_length": 288.3482360839844, + "completions/mean_terminated_length": 288.3482360839844, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.8169159360495101, + "grad_norm": 0.2405804991722107, + "kl": 0.0204620361328125, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 84718631.0, + "reward": 1.1986607313156128, + "reward_std": 0.1866028755903244, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1986607164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3994380533695221, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 278.7901916503906, + "completions/mean_terminated_length": 278.7901916503906, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.8210417741103662, + "grad_norm": 0.30192846059799194, + "kl": 0.024627685546875, + "learning_rate": 1e-06, + "loss": 0.0157, + "num_tokens": 85102669.0, + "reward": 1.1897321939468384, + "reward_std": 0.2407066822052002, + "rewards/code_format_reward/mean": 0.9888392686843872, + "rewards/code_format_reward/std": 0.10517053306102753, + "rewards/curriculum_aware_reward_fn/mean": 0.2008928507566452, + "rewards/curriculum_aware_reward_fn/std": 0.4011160135269165, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 770.0, + "completions/max_terminated_length": 770.0, + "completions/mean_length": 292.1763610839844, + "completions/mean_terminated_length": 292.1763610839844, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.8251676121712223, + "grad_norm": 0.284535676240921, + "kl": 0.0212249755859375, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 85515524.0, + "reward": 1.1852679252624512, + "reward_std": 0.2346602827310562, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548, + "rewards/curriculum_aware_reward_fn/std": 0.39252743124961853, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1026.0, + "completions/max_terminated_length": 1026.0, + "completions/mean_length": 276.66741943359375, + "completions/mean_terminated_length": 276.66741943359375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.8292934502320783, + "grad_norm": 0.2965720593929291, + "kl": 0.0219879150390625, + "learning_rate": 1e-06, + "loss": 0.0276, + "num_tokens": 85908173.0, + "reward": 1.2165179252624512, + "reward_std": 0.2559392750263214, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.2209821492433548, + "rewards/curriculum_aware_reward_fn/std": 0.4153723120689392, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 834.0, + "completions/max_terminated_length": 834.0, + "completions/mean_length": 283.140625, + "completions/mean_terminated_length": 283.140625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.8334192882929345, + "grad_norm": 0.2588541507720947, + "kl": 0.0260009765625, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 86294237.0, + "reward": 1.15625, + "reward_std": 0.2073148787021637, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.15625, + "rewards/curriculum_aware_reward_fn/std": 0.36349809169769287, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 295.6540222167969, + "completions/mean_terminated_length": 295.6540222167969, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.8375451263537906, + "grad_norm": 0.2303047776222229, + "kl": 0.0272064208984375, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 86708692.0, + "reward": 1.1227679252624512, + "reward_std": 0.17258599400520325, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.125, + "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 736.0, + "completions/max_terminated_length": 736.0, + "completions/mean_length": 268.2276916503906, + "completions/mean_terminated_length": 268.2276916503906, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.8416709644146467, + "grad_norm": 0.28269636631011963, + "kl": 0.0269012451171875, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 87093689.0, + "reward": 1.1674107313156128, + "reward_std": 0.18484607338905334, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.171875, + "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1485.0, + "completions/max_terminated_length": 1485.0, + "completions/mean_length": 270.5669860839844, + "completions/mean_terminated_length": 270.5669860839844, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.8457968024755028, + "grad_norm": 0.3281504511833191, + "kl": 0.0283966064453125, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 87479685.0, + "reward": 1.2209821939468384, + "reward_std": 0.28913792967796326, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, + "rewards/curriculum_aware_reward_fn/std": 0.4221988022327423, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 878.0, + "completions/max_terminated_length": 878.0, + "completions/mean_length": 264.1942138671875, + "completions/mean_terminated_length": 264.1942138671875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.8499226405363589, + "grad_norm": 0.28296637535095215, + "kl": 0.03045654296875, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 87859158.0, + "reward": 1.1629464626312256, + "reward_std": 0.2186012715101242, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452, + "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1376.0, + "completions/max_terminated_length": 1376.0, + "completions/mean_length": 257.7857360839844, + "completions/mean_terminated_length": 257.7857360839844, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.8540484785972151, + "grad_norm": 0.325296550989151, + "kl": 0.0262603759765625, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 88231364.0, + "reward": 1.2254464626312256, + "reward_std": 0.2650907635688782, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.2299107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.42124560475349426, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 753.0, + "completions/max_terminated_length": 753.0, + "completions/mean_length": 266.5379638671875, + "completions/mean_terminated_length": 266.5379638671875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.8581743166580712, + "grad_norm": 0.2887232303619385, + "kl": 0.0287933349609375, + "learning_rate": 1e-06, + "loss": 0.0168, + "num_tokens": 88611985.0, + "reward": 1.2477679252624512, + "reward_std": 0.24733349680900574, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.25, + "rewards/curriculum_aware_reward_fn/std": 0.43349677324295044, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 274.9933166503906, + "completions/mean_terminated_length": 266.4451904296875, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.8623001547189273, + "grad_norm": 0.2456807792186737, + "kl": 0.0304107666015625, + "learning_rate": 1e-06, + "loss": 0.0256, + "num_tokens": 89014062.0, + "reward": 1.15625, + "reward_std": 0.20821687579154968, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843171834946, + "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3697296380996704, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 269.3727722167969, + "completions/mean_terminated_length": 260.81207275390625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.8664259927797834, + "grad_norm": 0.311091810464859, + "kl": 0.0285797119140625, + "learning_rate": 1e-06, + "loss": 0.034, + "num_tokens": 89401434.0, + "reward": 1.2299107313156128, + "reward_std": 0.251747727394104, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.2366071492433548, + "rewards/curriculum_aware_reward_fn/std": 0.4254741966724396, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 538.0, + "completions/max_terminated_length": 538.0, + "completions/mean_length": 240.2366180419922, + "completions/mean_terminated_length": 240.2366180419922, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.8705518308406395, + "grad_norm": 0.29727450013160706, + "kl": 0.0320892333984375, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 89763790.0, + "reward": 1.1473214626312256, + "reward_std": 0.20072634518146515, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968, + "rewards/curriculum_aware_reward_fn/std": 0.36324387788772583, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 252.15179443359375, + "completions/mean_terminated_length": 252.15179443359375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.8746776689014956, + "grad_norm": 0.30429133772850037, + "kl": 0.0283050537109375, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 90140269.0, + "reward": 1.1941964626312256, + "reward_std": 0.2299019694328308, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968, + "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 953.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 262.7433166503906, + "completions/mean_terminated_length": 262.7433166503906, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.8788035069623518, + "grad_norm": 0.33080312609672546, + "kl": 0.0282135009765625, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 90529879.0, + "reward": 1.2388393878936768, + "reward_std": 0.2763974070549011, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.2410714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.42821168899536133, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 835.0, + "completions/max_terminated_length": 835.0, + "completions/mean_length": 266.51788330078125, + "completions/mean_terminated_length": 266.51788330078125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.8829293450232079, + "grad_norm": 0.2860460877418518, + "kl": 0.026336669921875, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 90929384.0, + "reward": 1.2142857313156128, + "reward_std": 0.24735590815544128, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.21875, + "rewards/curriculum_aware_reward_fn/std": 0.4138607978820801, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 262.37725830078125, + "completions/mean_terminated_length": 262.37725830078125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.887055183084064, + "grad_norm": 0.30841729044914246, + "kl": 0.027740478515625, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 91306111.0, + "reward": 1.1897321939468384, + "reward_std": 0.23583011329174042, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3942854106426239, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 256.8482360839844, + "completions/mean_terminated_length": 256.8482360839844, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.89118102114492, + "grad_norm": 11.453292846679688, + "kl": 0.8353729248046875, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 91690343.0, + "reward": 1.196428656578064, + "reward_std": 0.23445691168308258, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.2053571492433548, + "rewards/curriculum_aware_reward_fn/std": 0.40441396832466125, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 260.1651916503906, + "completions/mean_terminated_length": 260.1651916503906, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.8953068592057761, + "grad_norm": 0.2754879891872406, + "kl": 0.0255889892578125, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 92069019.0, + "reward": 1.1629464626312256, + "reward_std": 0.20804257690906525, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 688.0, + "completions/max_terminated_length": 688.0, + "completions/mean_length": 244.32366943359375, + "completions/mean_terminated_length": 244.32366943359375, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.8994326972666323, + "grad_norm": 0.31076574325561523, + "kl": 0.0318450927734375, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 92435873.0, + "reward": 1.2254464626312256, + "reward_std": 0.2242594063282013, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.2276785671710968, + "rewards/curriculum_aware_reward_fn/std": 0.41980281472206116, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1105.0, + "completions/max_terminated_length": 1105.0, + "completions/mean_length": 259.37054443359375, + "completions/mean_terminated_length": 259.37054443359375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.9035585353274884, + "grad_norm": 0.3404967188835144, + "kl": 0.029510498046875, + "learning_rate": 1e-06, + "loss": -0.0106, + "num_tokens": 92822418.0, + "reward": 1.2098214626312256, + "reward_std": 0.26485005021095276, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452, + "rewards/curriculum_aware_reward_fn/std": 0.4177219867706299, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1197.0, + "completions/mean_length": 266.6473388671875, + "completions/mean_terminated_length": 258.0805358886719, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.9076843733883445, + "grad_norm": 0.32294103503227234, + "kl": 0.0294342041015625, + "learning_rate": 1e-06, + "loss": 0.0281, + "num_tokens": 93196529.0, + "reward": 1.2544643878936768, + "reward_std": 0.270959734916687, + "rewards/code_format_reward/mean": 0.9799107313156128, + "rewards/code_format_reward/std": 0.14046260714530945, + "rewards/curriculum_aware_reward_fn/mean": 0.2745535671710968, + "rewards/curriculum_aware_reward_fn/std": 0.4467879831790924, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 262.7098388671875, + "completions/mean_terminated_length": 262.7098388671875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.9118102114492006, + "grad_norm": 0.25437629222869873, + "kl": 0.0256500244140625, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 93565943.0, + "reward": 1.21875, + "reward_std": 0.1944902390241623, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.2209821492433548, + "rewards/curriculum_aware_reward_fn/std": 0.4153723120689392, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 261.0044860839844, + "completions/mean_terminated_length": 261.0044860839844, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.9159360495100567, + "grad_norm": 0.2887217104434967, + "kl": 0.032867431640625, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 93946353.0, + "reward": 1.1629464626312256, + "reward_std": 0.2028724104166031, + "rewards/code_format_reward/mean": 0.984375, + "rewards/code_format_reward/std": 0.12415824085474014, + "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3834212124347687, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1376.0, + "completions/max_terminated_length": 1376.0, + "completions/mean_length": 297.7790222167969, + "completions/mean_terminated_length": 297.7790222167969, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.9200618875709129, + "grad_norm": 0.31769412755966187, + "kl": 0.024017333984375, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 94344953.0, + "reward": 1.1473214626312256, + "reward_std": 0.2656678259372711, + "rewards/code_format_reward/mean": 0.9732142686843872, + "rewards/code_format_reward/std": 0.1616371124982834, + "rewards/curriculum_aware_reward_fn/mean": 0.1741071492433548, + "rewards/curriculum_aware_reward_fn/std": 0.39691102504730225, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 873.0, + "completions/max_terminated_length": 873.0, + "completions/mean_length": 299.4977722167969, + "completions/mean_terminated_length": 299.4977722167969, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.924187725631769, + "grad_norm": 0.30828657746315, + "kl": 0.0286712646484375, + "learning_rate": 1e-06, + "loss": 0.0251, + "num_tokens": 94757014.0, + "reward": 1.1473214626312256, + "reward_std": 0.26786863803863525, + "rewards/code_format_reward/mean": 0.9598214030265808, + "rewards/code_format_reward/std": 0.1965973675251007, + "rewards/curriculum_aware_reward_fn/mean": 0.1875, + "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2189.0, + "completions/max_terminated_length": 2189.0, + "completions/mean_length": 278.9576110839844, + "completions/mean_terminated_length": 278.9576110839844, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.9283135636926251, + "grad_norm": 0.3763558864593506, + "kl": 0.026763916015625, + "learning_rate": 1e-06, + "loss": 0.016, + "num_tokens": 95150605.0, + "reward": 1.1763393878936768, + "reward_std": 0.3294941782951355, + "rewards/code_format_reward/mean": 0.9464285969734192, + "rewards/code_format_reward/std": 0.2254217267036438, + "rewards/curriculum_aware_reward_fn/mean": 0.2299107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.42124560475349426, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 283.1785888671875, + "completions/mean_terminated_length": 283.1785888671875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.9324394017534812, + "grad_norm": 0.37017151713371277, + "kl": 0.0257415771484375, + "learning_rate": 1e-06, + "loss": 0.0236, + "num_tokens": 95541054.0, + "reward": 1.1696429252624512, + "reward_std": 0.2892906069755554, + "rewards/code_format_reward/mean": 0.953125, + "rewards/code_format_reward/std": 0.21160738170146942, + "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452, + "rewards/curriculum_aware_reward_fn/std": 0.41233164072036743, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0022321428571429047, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1383.0, + "completions/mean_length": 281.75225830078125, + "completions/mean_terminated_length": 273.21923828125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.9365652398143373, + "grad_norm": 0.3290660083293915, + "kl": 0.0265350341796875, + "learning_rate": 1e-06, + "loss": 0.0388, + "num_tokens": 95929912.0, + "reward": 1.2388393878936768, + "reward_std": 0.27593371272087097, + "rewards/code_format_reward/mean": 0.9754464030265808, + "rewards/code_format_reward/std": 0.1549331247806549, + "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064, + "rewards/curriculum_aware_reward_fn/std": 0.44096609950065613, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1050.0, + "completions/max_terminated_length": 1050.0, + "completions/mean_length": 293.28350830078125, + "completions/mean_terminated_length": 293.28350830078125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.9406910778751933, + "grad_norm": 0.300258994102478, + "kl": 0.0255126953125, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 96326386.0, + "reward": 1.1897321939468384, + "reward_std": 0.2706966996192932, + "rewards/code_format_reward/mean": 0.96875, + "rewards/code_format_reward/std": 0.17418713867664337, + "rewards/curriculum_aware_reward_fn/mean": 0.2209821492433548, + "rewards/curriculum_aware_reward_fn/std": 0.4153723120689392, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1864.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 316.2477722167969, + "completions/mean_terminated_length": 316.2477722167969, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.9448169159360496, + "grad_norm": 0.29400742053985596, + "kl": 0.0277099609375, + "learning_rate": 1e-06, + "loss": 0.0429, + "num_tokens": 96749164.0, + "reward": 1.1227679252624512, + "reward_std": 0.2612314522266388, + "rewards/code_format_reward/mean": 0.9620535969734192, + "rewards/code_format_reward/std": 0.19128035008907318, + "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484, + "rewards/curriculum_aware_reward_fn/std": 0.3676777780056, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 284.5535888671875, + "completions/mean_terminated_length": 284.5535888671875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.9489427539969056, + "grad_norm": 0.2488163560628891, + "kl": 0.029022216796875, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 97146046.0, + "reward": 1.1696429252624512, + "reward_std": 0.17548497021198273, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032, + "rewards/curriculum_aware_reward_fn/std": 0.3834212124347687, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 291.2165222167969, + "completions/mean_terminated_length": 291.2165222167969, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.9530685920577617, + "grad_norm": 0.2761755883693695, + "kl": 0.02569580078125, + "learning_rate": 1e-06, + "loss": 0.0327, + "num_tokens": 97538143.0, + "reward": 1.2165179252624512, + "reward_std": 0.2395087480545044, + "rewards/code_format_reward/mean": 0.9866071343421936, + "rewards/code_format_reward/std": 0.11507843434810638, + "rewards/curriculum_aware_reward_fn/mean": 0.2299107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.42124560475349426, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 718.0, + "completions/max_terminated_length": 718.0, + "completions/mean_length": 279.234375, + "completions/mean_terminated_length": 279.234375, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.9571944301186178, + "grad_norm": 0.2993510961532593, + "kl": 0.0256805419921875, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 97929947.0, + "reward": 1.2165179252624512, + "reward_std": 0.2513192594051361, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, + "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 290.8169860839844, + "completions/mean_terminated_length": 290.8169860839844, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.9613202681794739, + "grad_norm": 0.2771032750606537, + "kl": 0.0251007080078125, + "learning_rate": 1e-06, + "loss": 0.0053, + "num_tokens": 98328032.0, + "reward": 1.1897321939468384, + "reward_std": 0.21872369945049286, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.1986607164144516, + "rewards/curriculum_aware_reward_fn/std": 0.3994380831718445, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1063.0, + "completions/max_terminated_length": 1063.0, + "completions/mean_length": 291.54241943359375, + "completions/mean_terminated_length": 291.54241943359375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.9654461062403301, + "grad_norm": 0.26633113622665405, + "kl": 0.02685546875, + "learning_rate": 1e-06, + "loss": 0.0203, + "num_tokens": 98730600.0, + "reward": 1.125, + "reward_std": 0.19034792482852936, + "rewards/code_format_reward/mean": 0.9933035969734192, + "rewards/code_format_reward/std": 0.08164843916893005, + "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1147.0, + "completions/max_terminated_length": 1147.0, + "completions/mean_length": 300.8326110839844, + "completions/mean_terminated_length": 300.8326110839844, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.9695719443011862, + "grad_norm": 0.2591470181941986, + "kl": 0.0254974365234375, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 99136343.0, + "reward": 1.2366071939468384, + "reward_std": 0.22157251834869385, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.2388392835855484, + "rewards/curriculum_aware_reward_fn/std": 0.43206024169921875, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 738.0, + "completions/max_terminated_length": 738.0, + "completions/mean_length": 285.0223388671875, + "completions/mean_terminated_length": 285.0223388671875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.9736977823620423, + "grad_norm": 0.2993071973323822, + "kl": 0.03564453125, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 99538489.0, + "reward": 1.1473214626312256, + "reward_std": 0.218577578663826, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516, + "rewards/curriculum_aware_reward_fn/std": 0.359214186668396, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1059.0, + "completions/max_terminated_length": 1059.0, + "completions/mean_length": 304.1651916503906, + "completions/mean_terminated_length": 304.1651916503906, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.9778236204228984, + "grad_norm": 0.25762394070625305, + "kl": 0.029205322265625, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 99931039.0, + "reward": 1.1830357313156128, + "reward_std": 0.22913821041584015, + "rewards/code_format_reward/mean": 0.9955357313156128, + "rewards/code_format_reward/std": 0.06674052774906158, + "rewards/curriculum_aware_reward_fn/mean": 0.1875, + "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 295.3348388671875, + "completions/mean_terminated_length": 295.3348388671875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.9819494584837545, + "grad_norm": 0.3165225386619568, + "kl": 0.02850341796875, + "learning_rate": 1e-06, + "loss": 0.012, + "num_tokens": 100339303.0, + "reward": 1.2142857313156128, + "reward_std": 0.27128201723098755, + "rewards/code_format_reward/mean": 0.9910714030265808, + "rewards/code_format_reward/std": 0.09417349100112915, + "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484, + "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3986.0, + "completions/max_terminated_length": 3986.0, + "completions/mean_length": 316.00225830078125, + "completions/mean_terminated_length": 316.00225830078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.9860752965446106, + "grad_norm": 0.25554707646369934, + "kl": 0.02362060546875, + "learning_rate": 1e-06, + "loss": -0.0092, + "num_tokens": 100748125.0, + "reward": 1.1741071939468384, + "reward_std": 0.1868884414434433, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1741071492433548, + "rewards/curriculum_aware_reward_fn/std": 0.37962549924850464, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 827.0, + "completions/max_terminated_length": 827.0, + "completions/mean_length": 315.2321472167969, + "completions/mean_terminated_length": 315.2321472167969, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.9902011346054668, + "grad_norm": 0.4484083354473114, + "kl": 0.05804443359375, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 101162104.0, + "reward": 1.165178656578064, + "reward_std": 0.223335862159729, + "rewards/code_format_reward/mean": 0.9977678656578064, + "rewards/code_format_reward/std": 0.047245558351278305, + "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516, + "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 303.9665222167969, + "completions/mean_terminated_length": 303.9665222167969, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.9943269726663229, + "grad_norm": 0.21845290064811707, + "kl": 0.025726318359375, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 101574571.0, + "reward": 1.1316964626312256, + "reward_std": 0.15273067355155945, + "rewards/code_format_reward/mean": 1.0, + "rewards/code_format_reward/std": 0.0, + "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032, + "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 848.0, + "completions/max_terminated_length": 848.0, + "completions/mean_length": 299.19708251953125, + "completions/mean_terminated_length": 299.19708251953125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.998452810727179, + "grad_norm": 0.26789233088493347, + "kl": 0.0255889892578125, + "learning_rate": 1e-06, + "loss": 0.0175, + "num_tokens": 101989150.0, + "reward": 1.1674107313156128, + "reward_std": 0.21147961914539337, + "rewards/code_format_reward/mean": 0.984375, + "rewards/code_format_reward/std": 0.12415824085474014, + "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516, + "rewards/curriculum_aware_reward_fn/std": 0.387128084897995, + "step": 242 + }, + { + "epoch": 0.998452810727179, + "step": 242, + "total_flos": 0.0, + "train_loss": 0.012992730054614463, + "train_runtime": 16504.2621, + "train_samples_per_second": 0.94, + "train_steps_per_second": 0.015 + } + ], + "logging_steps": 1, + "max_steps": 242, + "num_input_tokens_seen": 101989150, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}