diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,6577 @@
+{
+  "best_global_step": null,
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.998452810727179,
+  "eval_steps": 500,
+  "global_step": 242,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2295.0,
+      "completions/max_terminated_length": 2295.0,
+      "completions/mean_length": 481.84600830078125,
+      "completions/mean_terminated_length": 481.84600830078125,
+      "completions/min_length": 3.0,
+      "completions/min_terminated_length": 3.0,
+      "epoch": 0.0041258380608561115,
+      "grad_norm": 0.24569128453731537,
+      "kl": 0.0,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 482686.0,
+      "reward": 0.1160714328289032,
+      "reward_std": 0.20019538700580597,
+      "rewards/code_format_reward/mean": 0.046875,
+      "rewards/code_format_reward/std": 0.21160738170146942,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0691964253783226,
+      "rewards/curriculum_aware_reward_fn/std": 0.2627292275428772,
+      "step": 1
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1400.0,
+      "completions/max_terminated_length": 1400.0,
+      "completions/mean_length": 481.9620666503906,
+      "completions/mean_terminated_length": 481.9620666503906,
+      "completions/min_length": 88.0,
+      "completions/min_terminated_length": 88.0,
+      "epoch": 0.008251676121712223,
+      "grad_norm": 0.24532559514045715,
+      "kl": 0.00029015541076660156,
+      "learning_rate": 1e-06,
+      "loss": 0.0088,
+      "num_tokens": 947945.0,
+      "reward": 0.122767873108387,
+      "reward_std": 0.20709191262722015,
+      "rewards/code_format_reward/mean": 0.0446428582072258,
+      "rewards/code_format_reward/std": 0.2067493349313736,
+      "rewards/curriculum_aware_reward_fn/mean": 0.078125,
+      "rewards/curriculum_aware_reward_fn/std": 0.26866820454597473,
+      "step": 2
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1337.0,
+      "completions/max_terminated_length": 1337.0,
+      "completions/mean_length": 478.7745666503906,
+      "completions/mean_terminated_length": 478.7745666503906,
+      "completions/min_length": 52.0,
+      "completions/min_terminated_length": 52.0,
+      "epoch": 0.012377514182568335,
+      "grad_norm": 0.303946316242218,
+      "kl": 0.0003151893615722656,
+      "learning_rate": 1e-06,
+      "loss": 0.02,
+      "num_tokens": 1414835.0,
+      "reward": 0.1473214328289032,
+      "reward_std": 0.270842045545578,
+      "rewards/code_format_reward/mean": 0.0758928582072258,
+      "rewards/code_format_reward/std": 0.265122652053833,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043,
+      "step": 3
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1680.0,
+      "completions/max_terminated_length": 1680.0,
+      "completions/mean_length": 491.4375305175781,
+      "completions/mean_terminated_length": 491.4375305175781,
+      "completions/min_length": 67.0,
+      "completions/min_terminated_length": 67.0,
+      "epoch": 0.016503352243424446,
+      "grad_norm": 0.27564534544944763,
+      "kl": 0.0003771781921386719,
+      "learning_rate": 1e-06,
+      "loss": 0.013,
+      "num_tokens": 1915020.0,
+      "reward": 0.1406250298023224,
+      "reward_std": 0.2626669406890869,
+      "rewards/code_format_reward/mean": 0.0647321417927742,
+      "rewards/code_format_reward/std": 0.24632768332958221,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0758928582072258,
+      "rewards/curriculum_aware_reward_fn/std": 0.265122652053833,
+      "step": 4
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1558.0,
+      "completions/max_terminated_length": 1558.0,
+      "completions/mean_length": 510.8504638671875,
+      "completions/mean_terminated_length": 510.8504638671875,
+      "completions/min_length": 26.0,
+      "completions/min_terminated_length": 26.0,
+      "epoch": 0.020629190304280558,
+      "grad_norm": 0.3501759469509125,
+      "kl": 0.0005273818969726562,
+      "learning_rate": 1e-06,
+      "loss": 0.0284,
+      "num_tokens": 2422824.0,
+      "reward": 0.2343750149011612,
+      "reward_std": 0.3528774082660675,
+      "rewards/code_format_reward/mean": 0.1361607164144516,
+      "rewards/code_format_reward/std": 0.34334251284599304,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0982142835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.29793688654899597,
+      "step": 5
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1621.0,
+      "completions/mean_length": 467.83038330078125,
+      "completions/mean_terminated_length": 459.7136535644531,
+      "completions/min_length": 13.0,
+      "completions/min_terminated_length": 13.0,
+      "epoch": 0.02475502836513667,
+      "grad_norm": 0.41125398874282837,
+      "kl": 0.0011577606201171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0271,
+      "num_tokens": 2907715.0,
+      "reward": 0.314732164144516,
+      "reward_std": 0.4204106032848358,
+      "rewards/code_format_reward/mean": 0.203125,
+      "rewards/code_format_reward/std": 0.4027745723724365,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742,
+      "rewards/curriculum_aware_reward_fn/std": 0.31523454189300537,
+      "step": 6
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1866.0,
+      "completions/max_terminated_length": 1866.0,
+      "completions/mean_length": 464.36163330078125,
+      "completions/mean_terminated_length": 464.36163330078125,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "epoch": 0.02888086642599278,
+      "grad_norm": 0.4432518482208252,
+      "kl": 0.0016002655029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0249,
+      "num_tokens": 3396491.0,
+      "reward": 0.3549107611179352,
+      "reward_std": 0.4484630227088928,
+      "rewards/code_format_reward/mean": 0.2700892984867096,
+      "rewards/code_format_reward/std": 0.444502055644989,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226,
+      "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478,
+      "step": 7
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2789.0,
+      "completions/max_terminated_length": 2789.0,
+      "completions/mean_length": 434.5513610839844,
+      "completions/mean_terminated_length": 434.5513610839844,
+      "completions/min_length": 90.0,
+      "completions/min_terminated_length": 90.0,
+      "epoch": 0.03300670448684889,
+      "grad_norm": 0.5083041191101074,
+      "kl": 0.0022182464599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0264,
+      "num_tokens": 3854503.0,
+      "reward": 0.4285714626312256,
+      "reward_std": 0.5326574444770813,
+      "rewards/code_format_reward/mean": 0.3571428656578064,
+      "rewards/code_format_reward/std": 0.47969308495521545,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043,
+      "step": 8
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1266.0,
+      "completions/max_terminated_length": 1266.0,
+      "completions/mean_length": 458.1339416503906,
+      "completions/mean_terminated_length": 458.1339416503906,
+      "completions/min_length": 48.0,
+      "completions/min_terminated_length": 48.0,
+      "epoch": 0.037132542547705004,
+      "grad_norm": 0.470951110124588,
+      "kl": 0.0035037994384765625,
+      "learning_rate": 1e-06,
+      "loss": -0.0005,
+      "num_tokens": 4321246.0,
+      "reward": 0.5022321939468384,
+      "reward_std": 0.5391286015510559,
+      "rewards/code_format_reward/mean": 0.4151785671710968,
+      "rewards/code_format_reward/std": 0.49330368638038635,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0870535746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.2822286784648895,
+      "step": 9
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1260.0,
+      "completions/max_terminated_length": 1260.0,
+      "completions/mean_length": 413.63616943359375,
+      "completions/mean_terminated_length": 413.63616943359375,
+      "completions/min_length": 97.0,
+      "completions/min_terminated_length": 97.0,
+      "epoch": 0.041258380608561115,
+      "grad_norm": 0.43107807636260986,
+      "kl": 0.0074920654296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0132,
+      "num_tokens": 4777000.0,
+      "reward": 0.7700893878936768,
+      "reward_std": 0.5293206572532654,
+      "rewards/code_format_reward/mean": 0.6830357313156128,
+      "rewards/code_format_reward/std": 0.4658135175704956,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0870535746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.2822286784648895,
+      "step": 10
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1706.0,
+      "completions/mean_length": 430.3906555175781,
+      "completions/mean_terminated_length": 422.1901550292969,
+      "completions/min_length": 125.0,
+      "completions/min_terminated_length": 125.0,
+      "epoch": 0.04538421866941723,
+      "grad_norm": 0.4257507026195526,
+      "kl": 0.00693511962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0281,
+      "num_tokens": 5234863.0,
+      "reward": 0.8683035969734192,
+      "reward_std": 0.4879590570926666,
+      "rewards/code_format_reward/mean": 0.7477678656578064,
+      "rewards/code_format_reward/std": 0.4347792863845825,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176,
+      "step": 11
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 3014.0,
+      "completions/mean_length": 438.3750305175781,
+      "completions/mean_terminated_length": 430.1923828125,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "epoch": 0.04951005673027334,
+      "grad_norm": 0.3997235596179962,
+      "kl": 0.007152557373046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0256,
+      "num_tokens": 5699194.0,
+      "reward": 0.8705357313156128,
+      "reward_std": 0.39619719982147217,
+      "rewards/code_format_reward/mean": 0.8102678656578064,
+      "rewards/code_format_reward/std": 0.39252740144729614,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0602678582072258,
+      "rewards/curriculum_aware_reward_fn/std": 0.23824848234653473,
+      "step": 12
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2370.0,
+      "completions/max_terminated_length": 2370.0,
+      "completions/mean_length": 409.2701110839844,
+      "completions/mean_terminated_length": 409.2701110839844,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "epoch": 0.05363589479112945,
+      "grad_norm": 0.39804381132125854,
+      "kl": 0.00933074951171875,
+      "learning_rate": 1e-06,
+      "loss": 0.024,
+      "num_tokens": 6151046.0,
+      "reward": 0.930803656578064,
+      "reward_std": 0.4214654862880707,
+      "rewards/code_format_reward/mean": 0.8214285969734192,
+      "rewards/code_format_reward/std": 0.3834212124347687,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781,
+      "step": 13
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1497.0,
+      "completions/max_terminated_length": 1497.0,
+      "completions/mean_length": 403.5692138671875,
+      "completions/mean_terminated_length": 403.5692138671875,
+      "completions/min_length": 62.0,
+      "completions/min_terminated_length": 62.0,
+      "epoch": 0.05776173285198556,
+      "grad_norm": 0.372781902551651,
+      "kl": 0.0071258544921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0032,
+      "num_tokens": 6616948.0,
+      "reward": 0.9910714626312256,
+      "reward_std": 0.3275630474090576,
+      "rewards/code_format_reward/mean": 0.8727678656578064,
+      "rewards/code_format_reward/std": 0.3336053788661957,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1183035746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.32332828640937805,
+      "step": 14
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1976.0,
+      "completions/max_terminated_length": 1976.0,
+      "completions/mean_length": 395.19866943359375,
+      "completions/mean_terminated_length": 395.19866943359375,
+      "completions/min_length": 84.0,
+      "completions/min_terminated_length": 84.0,
+      "epoch": 0.06188757091284167,
+      "grad_norm": 0.35227906703948975,
+      "kl": 0.007076263427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0116,
+      "num_tokens": 7048573.0,
+      "reward": 1.0290179252624512,
+      "reward_std": 0.3685193955898285,
+      "rewards/code_format_reward/mean": 0.8816964030265808,
+      "rewards/code_format_reward/std": 0.32332828640937805,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437,
+      "step": 15
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1582.0,
+      "completions/max_terminated_length": 1582.0,
+      "completions/mean_length": 401.7633972167969,
+      "completions/mean_terminated_length": 401.7633972167969,
+      "completions/min_length": 113.0,
+      "completions/min_terminated_length": 113.0,
+      "epoch": 0.06601340897369778,
+      "grad_norm": 0.37148913741111755,
+      "kl": 0.009387969970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0612,
+      "num_tokens": 7513155.0,
+      "reward": 1.0022321939468384,
+      "reward_std": 0.335994154214859,
+      "rewards/code_format_reward/mean": 0.90625,
+      "rewards/code_format_reward/std": 0.2918064594268799,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0959821417927742,
+      "rewards/curriculum_aware_reward_fn/std": 0.29489606618881226,
+      "step": 16
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2088.0,
+      "completions/max_terminated_length": 2088.0,
+      "completions/mean_length": 400.2812805175781,
+      "completions/mean_terminated_length": 400.2812805175781,
+      "completions/min_length": 127.0,
+      "completions/min_terminated_length": 127.0,
+      "epoch": 0.07013924703455389,
+      "grad_norm": 0.3265136480331421,
+      "kl": 0.01309967041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0591,
+      "num_tokens": 7951544.0,
+      "reward": 1.0267857313156128,
+      "reward_std": 0.2912137508392334,
+      "rewards/code_format_reward/mean": 0.9375,
+      "rewards/code_format_reward/std": 0.24233205616474152,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0892857164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.30073946714401245,
+      "step": 17
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1343.0,
+      "completions/max_terminated_length": 1343.0,
+      "completions/mean_length": 367.97100830078125,
+      "completions/mean_terminated_length": 367.97100830078125,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.07426508509541001,
+      "grad_norm": 0.2556539475917816,
+      "kl": 0.009555816650390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0147,
+      "num_tokens": 8385776.0,
+      "reward": 1.0982143878936768,
+      "reward_std": 0.20816494524478912,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1616371124982834,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996,
+      "step": 18
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1695.0,
+      "completions/max_terminated_length": 1695.0,
+      "completions/mean_length": 390.2969055175781,
+      "completions/mean_terminated_length": 390.2969055175781,
+      "completions/min_length": 85.0,
+      "completions/min_terminated_length": 85.0,
+      "epoch": 0.07839092315626611,
+      "grad_norm": 0.2662300765514374,
+      "kl": 0.0101318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.019,
+      "num_tokens": 8851108.0,
+      "reward": 1.0669643878936768,
+      "reward_std": 0.18616731464862823,
+      "rewards/code_format_reward/mean": 0.9776785969734192,
+      "rewards/code_format_reward/std": 0.1478918492794037,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0892857164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.2854744791984558,
+      "step": 19
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2309.0,
+      "completions/max_terminated_length": 2309.0,
+      "completions/mean_length": 379.9040222167969,
+      "completions/mean_terminated_length": 379.9040222167969,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "epoch": 0.08251676121712223,
+      "grad_norm": 0.23065051436424255,
+      "kl": 0.01053619384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0092,
+      "num_tokens": 9290490.0,
+      "reward": 1.118303656578064,
+      "reward_std": 0.17734426259994507,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996,
+      "step": 20
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1933.0,
+      "completions/max_terminated_length": 1933.0,
+      "completions/mean_length": 356.87054443359375,
+      "completions/mean_terminated_length": 356.87054443359375,
+      "completions/min_length": 131.0,
+      "completions/min_terminated_length": 131.0,
+      "epoch": 0.08664259927797834,
+      "grad_norm": 0.625492513179779,
+      "kl": 0.04799652099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0243,
+      "num_tokens": 9711349.0,
+      "reward": 1.078125,
+      "reward_std": 0.15985102951526642,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226,
+      "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478,
+      "step": 21
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1136.0,
+      "completions/max_terminated_length": 1136.0,
+      "completions/mean_length": 349.20538330078125,
+      "completions/mean_terminated_length": 349.20538330078125,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "epoch": 0.09076843733883445,
+      "grad_norm": 0.21386803686618805,
+      "kl": 0.0127105712890625,
+      "learning_rate": 1e-06,
+      "loss": 0.018,
+      "num_tokens": 10124924.0,
+      "reward": 1.0982143878936768,
+      "reward_std": 0.16575203835964203,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.30387791991233826,
+      "step": 22
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1185.0,
+      "completions/max_terminated_length": 1185.0,
+      "completions/mean_length": 341.2589416503906,
+      "completions/mean_terminated_length": 341.2589416503906,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "epoch": 0.09489427539969056,
+      "grad_norm": 0.2108008712530136,
+      "kl": 0.01441192626953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0101,
+      "num_tokens": 10540357.0,
+      "reward": 1.078125,
+      "reward_std": 0.13501358032226562,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0848214253783226,
+      "rewards/curriculum_aware_reward_fn/std": 0.2789272665977478,
+      "step": 23
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1019.0,
+      "completions/max_terminated_length": 1019.0,
+      "completions/mean_length": 358.1629638671875,
+      "completions/mean_terminated_length": 358.1629638671875,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 0.09902011346054668,
+      "grad_norm": 0.22388455271720886,
+      "kl": 0.01308441162109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0167,
+      "num_tokens": 10973079.0,
+      "reward": 1.118303656578064,
+      "reward_std": 0.15738239884376526,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1227678582072258,
+      "rewards/curriculum_aware_reward_fn/std": 0.3285374045372009,
+      "step": 24
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1847.0,
+      "completions/max_terminated_length": 1847.0,
+      "completions/mean_length": 345.7008972167969,
+      "completions/mean_terminated_length": 345.7008972167969,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "epoch": 0.10314595152140278,
+      "grad_norm": 0.9551708102226257,
+      "kl": 0.07257080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0215,
+      "num_tokens": 11398370.0,
+      "reward": 1.1294643878936768,
+      "reward_std": 0.18270127475261688,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133,
+      "step": 25
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1139.0,
+      "completions/max_terminated_length": 1139.0,
+      "completions/mean_length": 354.1875305175781,
+      "completions/mean_terminated_length": 354.1875305175781,
+      "completions/min_length": 61.0,
+      "completions/min_terminated_length": 61.0,
+      "epoch": 0.1072717895822589,
+      "grad_norm": 0.20303034782409668,
+      "kl": 0.0153656005859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0156,
+      "num_tokens": 11834365.0,
+      "reward": 1.102678656578064,
+      "reward_std": 0.136513814330101,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.30387789011001587,
+      "step": 26
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1112.0,
+      "completions/max_terminated_length": 1112.0,
+      "completions/mean_length": 348.15179443359375,
+      "completions/mean_terminated_length": 348.15179443359375,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.111397627643115,
+      "grad_norm": 0.23060455918312073,
+      "kl": 0.01444244384765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0246,
+      "num_tokens": 12265857.0,
+      "reward": 1.118303656578064,
+      "reward_std": 0.19625361263751984,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3377779722213745,
+      "step": 27
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1949.0,
+      "completions/max_terminated_length": 1949.0,
+      "completions/mean_length": 341.3035888671875,
+      "completions/mean_terminated_length": 341.3035888671875,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "epoch": 0.11552346570397112,
+      "grad_norm": 0.21146497130393982,
+      "kl": 0.0152740478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0083,
+      "num_tokens": 12689969.0,
+      "reward": 1.1205358505249023,
+      "reward_std": 0.14929363131523132,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176,
+      "step": 28
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2630.0,
+      "completions/max_terminated_length": 2630.0,
+      "completions/mean_length": 342.83929443359375,
+      "completions/mean_terminated_length": 342.83929443359375,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 0.11964930376482723,
+      "grad_norm": 0.2256154865026474,
+      "kl": 0.01830291748046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0169,
+      "num_tokens": 13120551.0,
+      "reward": 1.1160714626312256,
+      "reward_std": 0.168566033244133,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996,
+      "step": 29
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2030.0,
+      "completions/max_terminated_length": 2030.0,
+      "completions/mean_length": 362.7812805175781,
+      "completions/mean_terminated_length": 362.7812805175781,
+      "completions/min_length": 107.0,
+      "completions/min_terminated_length": 107.0,
+      "epoch": 0.12377514182568335,
+      "grad_norm": 0.2010965794324875,
+      "kl": 0.015350341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0161,
+      "num_tokens": 13546445.0,
+      "reward": 1.109375,
+      "reward_std": 0.15386441349983215,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742,
+      "rewards/curriculum_aware_reward_fn/std": 0.31523454189300537,
+      "step": 30
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1668.0,
+      "completions/max_terminated_length": 1668.0,
+      "completions/mean_length": 363.0826110839844,
+      "completions/mean_terminated_length": 363.0826110839844,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "epoch": 0.12790097988653945,
+      "grad_norm": 0.171152725815773,
+      "kl": 0.01485443115234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 13991038.0,
+      "reward": 1.1272321939468384,
+      "reward_std": 0.1273893564939499,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957,
+      "step": 31
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1164.0,
+      "completions/max_terminated_length": 1164.0,
+      "completions/mean_length": 357.60491943359375,
+      "completions/mean_terminated_length": 357.60491943359375,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.13202681794739557,
+      "grad_norm": 0.22520698606967926,
+      "kl": 0.01274871826171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0133,
+      "num_tokens": 14421823.0,
+      "reward": 1.1361607313156128,
+      "reward_std": 0.17906279861927032,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1361607164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.34334251284599304,
+      "step": 32
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1669.0,
+      "completions/max_terminated_length": 1669.0,
+      "completions/mean_length": 375.04913330078125,
+      "completions/mean_terminated_length": 375.04913330078125,
+      "completions/min_length": 97.0,
+      "completions/min_terminated_length": 97.0,
+      "epoch": 0.1361526560082517,
+      "grad_norm": 0.25881803035736084,
+      "kl": 0.0135498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 14867486.0,
+      "reward": 1.0825893878936768,
+      "reward_std": 0.16945692896842957,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0870535746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.2822286784648895,
+      "step": 33
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1557.0,
+      "completions/max_terminated_length": 1557.0,
+      "completions/mean_length": 373.47100830078125,
+      "completions/mean_terminated_length": 373.47100830078125,
+      "completions/min_length": 107.0,
+      "completions/min_terminated_length": 107.0,
+      "epoch": 0.14027849406910778,
+      "grad_norm": 0.22023342549800873,
+      "kl": 0.01361846923828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0065,
+      "num_tokens": 15295971.0,
+      "reward": 1.0982143878936768,
+      "reward_std": 0.16011416912078857,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226,
+      "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564,
+      "step": 34
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1468.0,
+      "completions/max_terminated_length": 1468.0,
+      "completions/mean_length": 359.8437805175781,
+      "completions/mean_terminated_length": 359.8437805175781,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "epoch": 0.1444043321299639,
+      "grad_norm": 0.2497127801179886,
+      "kl": 0.01373291015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0251,
+      "num_tokens": 15741577.0,
+      "reward": 1.1450893878936768,
+      "reward_std": 0.22232261300086975,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.35703200101852417,
+      "step": 35
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1656.0,
+      "completions/max_terminated_length": 1656.0,
+      "completions/mean_length": 371.7745666503906,
+      "completions/mean_terminated_length": 371.7745666503906,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "epoch": 0.14853017019082002,
+      "grad_norm": 0.2346869707107544,
+      "kl": 0.01375579833984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0004,
+      "num_tokens": 16177408.0,
+      "reward": 1.1205357313156128,
+      "reward_std": 0.19378496706485748,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996,
+      "step": 36
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3303.0,
+      "completions/max_terminated_length": 3303.0,
+      "completions/mean_length": 377.1004638671875,
+      "completions/mean_terminated_length": 377.1004638671875,
+      "completions/min_length": 107.0,
+      "completions/min_terminated_length": 107.0,
+      "epoch": 0.15265600825167613,
+      "grad_norm": 0.20092125236988068,
+      "kl": 0.011962890625,
+      "learning_rate": 1e-06,
+      "loss": 0.01,
+      "num_tokens": 16614421.0,
+      "reward": 1.087053656578064,
+      "reward_std": 0.1473119556903839,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0870535746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.2822286784648895,
+      "step": 37
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3311.0,
+      "completions/max_terminated_length": 3311.0,
+      "completions/mean_length": 371.7276916503906,
+      "completions/mean_terminated_length": 371.7276916503906,
+      "completions/min_length": 81.0,
+      "completions/min_terminated_length": 81.0,
+      "epoch": 0.15678184631253222,
+      "grad_norm": 0.20405592024326324,
+      "kl": 0.0123443603515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0315,
+      "num_tokens": 17038443.0,
+      "reward": 1.1294643878936768,
+      "reward_std": 0.17602959275245667,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133,
+      "step": 38
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2603.0,
+      "completions/max_terminated_length": 2603.0,
+      "completions/mean_length": 392.85491943359375,
+      "completions/mean_terminated_length": 392.85491943359375,
+      "completions/min_length": 137.0,
+      "completions/min_terminated_length": 137.0,
+      "epoch": 0.16090768437338834,
+      "grad_norm": 0.21517439186573029,
+      "kl": 0.01271820068359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0026,
+      "num_tokens": 17485083.0,
+      "reward": 1.0982143878936768,
+      "reward_std": 0.1524314433336258,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226,
+      "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564,
+      "step": 39
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1344.0,
+      "completions/max_terminated_length": 1344.0,
+      "completions/mean_length": 376.9107360839844,
+      "completions/mean_terminated_length": 376.9107360839844,
+      "completions/min_length": 118.0,
+      "completions/min_terminated_length": 118.0,
+      "epoch": 0.16503352243424446,
+      "grad_norm": 0.25572851300239563,
+      "kl": 0.01190948486328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0137,
+      "num_tokens": 17917580.0,
+      "reward": 1.102678656578064,
+      "reward_std": 0.1929987519979477,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1071428582072258,
+      "rewards/curriculum_aware_reward_fn/std": 0.3096405565738678,
+      "step": 40
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1003.0,
+      "completions/mean_length": 378.2388610839844,
+      "completions/mean_terminated_length": 369.92169189453125,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "epoch": 0.16915936049510058,
+      "grad_norm": 2.37631893157959,
+      "kl": 0.1428680419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0265,
+      "num_tokens": 18357651.0,
+      "reward": 1.1227679252624512,
+      "reward_std": 0.15738239884376526,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996,
+      "step": 41
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1205.0,
+      "completions/max_terminated_length": 1205.0,
+      "completions/mean_length": 330.35491943359375,
+      "completions/mean_terminated_length": 330.35491943359375,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 0.17328519855595667,
+      "grad_norm": 0.2657046616077423,
+      "kl": 0.01459503173828125,
+      "learning_rate": 1e-06,
+      "loss": 0.0266,
+      "num_tokens": 18747024.0,
+      "reward": 1.1450893878936768,
+      "reward_std": 0.22222581505775452,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.36136937141418457,
+      "step": 42
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2859.0,
+      "completions/mean_length": 376.16741943359375,
+      "completions/mean_terminated_length": 367.84564208984375,
+      "completions/min_length": 104.0,
+      "completions/min_terminated_length": 104.0,
+      "epoch": 0.1774110366168128,
+      "grad_norm": 0.23751172423362732,
+      "kl": 0.011627197265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0478,
+      "num_tokens": 19176227.0,
+      "reward": 1.165178656578064,
+      "reward_std": 0.22029609978199005,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457,
+      "step": 43
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1600.0,
+      "completions/max_terminated_length": 1600.0,
+      "completions/mean_length": 371.5602722167969,
+      "completions/mean_terminated_length": 371.5602722167969,
+      "completions/min_length": 67.0,
+      "completions/min_terminated_length": 67.0,
+      "epoch": 0.1815368746776689,
+      "grad_norm": 0.22557100653648376,
+      "kl": 0.0122222900390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0307,
+      "num_tokens": 19588183.0,
+      "reward": 1.0892857313156128,
+      "reward_std": 0.19364552199840546,
+      "rewards/code_format_reward/mean": 0.9888392686843872,
+      "rewards/code_format_reward/std": 0.10517053306102753,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1004464253783226,
+      "rewards/curriculum_aware_reward_fn/std": 0.30093035101890564,
+      "step": 44
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1344.0,
+      "completions/max_terminated_length": 1344.0,
+      "completions/mean_length": 381.6026916503906,
+      "completions/mean_terminated_length": 381.6026916503906,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "epoch": 0.18566271273852503,
+      "grad_norm": 0.24813039600849152,
+      "kl": 0.0107421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0071,
+      "num_tokens": 20048563.0,
+      "reward": 1.118303656578064,
+      "reward_std": 0.18272370100021362,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.32595089077949524,
+      "step": 45
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3901.0,
+      "completions/max_terminated_length": 3901.0,
+      "completions/mean_length": 373.51116943359375,
+      "completions/mean_terminated_length": 373.51116943359375,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 0.18978855079938112,
+      "grad_norm": 0.25957804918289185,
+      "kl": 0.010589599609375,
+      "learning_rate": 1e-06,
+      "loss": 0.021,
+      "num_tokens": 20484045.0,
+      "reward": 1.125,
+      "reward_std": 0.22440890967845917,
+      "rewards/code_format_reward/mean": 0.9799107313156128,
+      "rewards/code_format_reward/std": 0.14046260714530945,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3525845408439636,
+      "step": 46
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1483.0,
+      "completions/max_terminated_length": 1483.0,
+      "completions/mean_length": 377.4888610839844,
+      "completions/mean_terminated_length": 377.4888610839844,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 0.19391438886023724,
+      "grad_norm": 0.24632863700389862,
+      "kl": 0.010894775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0223,
+      "num_tokens": 20923290.0,
+      "reward": 1.1584821939468384,
+      "reward_std": 0.23042914271354675,
+      "rewards/code_format_reward/mean": 0.9866071343421936,
+      "rewards/code_format_reward/std": 0.11507844179868698,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775,
+      "step": 47
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1178.0,
+      "completions/max_terminated_length": 1178.0,
+      "completions/mean_length": 340.9508972167969,
+      "completions/mean_terminated_length": 340.9508972167969,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.19804022692109335,
+      "grad_norm": 0.29566898941993713,
+      "kl": 0.01230621337890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0087,
+      "num_tokens": 21330717.0,
+      "reward": 1.149553656578064,
+      "reward_std": 0.23722697794437408,
+      "rewards/code_format_reward/mean": 0.984375,
+      "rewards/code_format_reward/std": 0.12415824085474014,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.37175679206848145,
+      "step": 48
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1488.0,
+      "completions/max_terminated_length": 1488.0,
+      "completions/mean_length": 348.87054443359375,
+      "completions/mean_terminated_length": 348.87054443359375,
+      "completions/min_length": 74.0,
+      "completions/min_terminated_length": 74.0,
+      "epoch": 0.20216606498194944,
+      "grad_norm": 0.2383718490600586,
+      "kl": 0.0120697021484375,
+      "learning_rate": 1e-06,
+      "loss": -0.0001,
+      "num_tokens": 21756945.0,
+      "reward": 1.087053656578064,
+      "reward_std": 0.1802413910627365,
+      "rewards/code_format_reward/mean": 0.984375,
+      "rewards/code_format_reward/std": 0.12415824085474014,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1026785746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.31115278601646423,
+      "step": 49
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1112.0,
+      "completions/max_terminated_length": 1112.0,
+      "completions/mean_length": 348.9754638671875,
+      "completions/mean_terminated_length": 348.9754638671875,
+      "completions/min_length": 84.0,
+      "completions/min_terminated_length": 84.0,
+      "epoch": 0.20629190304280556,
+      "grad_norm": 0.2150750309228897,
+      "kl": 0.01216888427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0136,
+      "num_tokens": 22173742.0,
+      "reward": 1.109375,
+      "reward_std": 0.1667092889547348,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1183035746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.32332825660705566,
+      "step": 50
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1356.0,
+      "completions/max_terminated_length": 1356.0,
+      "completions/mean_length": 340.68304443359375,
+      "completions/mean_terminated_length": 340.68304443359375,
+      "completions/min_length": 102.0,
+      "completions/min_terminated_length": 102.0,
+      "epoch": 0.21041774110366168,
+      "grad_norm": 0.24686773121356964,
+      "kl": 0.0117034912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0149,
+      "num_tokens": 22590268.0,
+      "reward": 1.149553656578064,
+      "reward_std": 0.17362166941165924,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.36136940121650696,
+      "step": 51
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1582.0,
+      "completions/max_terminated_length": 1582.0,
+      "completions/mean_length": 364.9888610839844,
+      "completions/mean_terminated_length": 364.9888610839844,
+      "completions/min_length": 83.0,
+      "completions/min_terminated_length": 83.0,
+      "epoch": 0.2145435791645178,
+      "grad_norm": 0.24674323201179504,
+      "kl": 0.011444091796875,
+      "learning_rate": 1e-06,
+      "loss": -0.0145,
+      "num_tokens": 23044836.0,
+      "reward": 1.1049107313156128,
+      "reward_std": 0.20420950651168823,
+      "rewards/code_format_reward/mean": 0.9799107313156128,
+      "rewards/code_format_reward/std": 0.14046260714530945,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996,
+      "step": 52
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1734.0,
+      "completions/max_terminated_length": 1734.0,
+      "completions/mean_length": 349.61163330078125,
+      "completions/mean_terminated_length": 349.61163330078125,
+      "completions/min_length": 76.0,
+      "completions/min_terminated_length": 76.0,
+      "epoch": 0.2186694172253739,
+      "grad_norm": 0.27109435200691223,
+      "kl": 0.0108489990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0112,
+      "num_tokens": 23465278.0,
+      "reward": 1.1316965818405151,
+      "reward_std": 0.2340708076953888,
+      "rewards/code_format_reward/mean": 0.984375,
+      "rewards/code_format_reward/std": 0.12415824085474014,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437,
+      "step": 53
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3222.0,
+      "completions/max_terminated_length": 3222.0,
+      "completions/mean_length": 359.0558166503906,
+      "completions/mean_terminated_length": 359.0558166503906,
+      "completions/min_length": 56.0,
+      "completions/min_terminated_length": 56.0,
+      "epoch": 0.22279525528623,
+      "grad_norm": 0.2768162488937378,
+      "kl": 0.01114654541015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0068,
+      "num_tokens": 23879647.0,
+      "reward": 1.118303656578064,
+      "reward_std": 0.2705444395542145,
+      "rewards/code_format_reward/mean": 0.9754464030265808,
+      "rewards/code_format_reward/std": 0.1549331247806549,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361,
+      "step": 54
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1822.0,
+      "completions/max_terminated_length": 1822.0,
+      "completions/mean_length": 355.20538330078125,
+      "completions/mean_terminated_length": 355.20538330078125,
+      "completions/min_length": 78.0,
+      "completions/min_terminated_length": 78.0,
+      "epoch": 0.22692109334708613,
+      "grad_norm": 0.3178320527076721,
+      "kl": 0.01139068603515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0191,
+      "num_tokens": 24333101.0,
+      "reward": 1.071428656578064,
+      "reward_std": 0.24884316325187683,
+      "rewards/code_format_reward/mean": 0.9620535969734192,
+      "rewards/code_format_reward/std": 0.191280335187912,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781,
+      "step": 55
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 886.0,
+      "completions/mean_length": 361.8750305175781,
+      "completions/mean_terminated_length": 353.521240234375,
+      "completions/min_length": 113.0,
+      "completions/min_terminated_length": 113.0,
+      "epoch": 0.23104693140794225,
+      "grad_norm": 0.23652935028076172,
+      "kl": 0.01201629638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0182,
+      "num_tokens": 24784965.0,
+      "reward": 1.046875,
+      "reward_std": 0.16329465806484222,
+      "rewards/code_format_reward/mean": 0.9754464030265808,
+      "rewards/code_format_reward/std": 0.1549331247806549,
+      "rewards/curriculum_aware_reward_fn/mean": 0.0714285746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.2578272819519043,
+      "step": 56
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2192.0,
+      "completions/max_terminated_length": 2192.0,
+      "completions/mean_length": 353.0982360839844,
+      "completions/mean_terminated_length": 353.0982360839844,
+      "completions/min_length": 85.0,
+      "completions/min_terminated_length": 85.0,
+      "epoch": 0.23517276946879834,
+      "grad_norm": 0.24375389516353607,
+      "kl": 0.01190185546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0192,
+      "num_tokens": 25220899.0,
+      "reward": 1.1450893878936768,
+      "reward_std": 0.23259766399860382,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.37354570627212524,
+      "step": 57
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1225.0,
+      "completions/max_terminated_length": 1225.0,
+      "completions/mean_length": 322.4375,
+      "completions/mean_terminated_length": 322.4375,
+      "completions/min_length": 114.0,
+      "completions/min_terminated_length": 114.0,
+      "epoch": 0.23929860752965446,
+      "grad_norm": 0.2876569628715515,
+      "kl": 0.01218414306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0041,
+      "num_tokens": 25626276.0,
+      "reward": 1.165178656578064,
+      "reward_std": 0.23654192686080933,
+      "rewards/code_format_reward/mean": 0.9821428656578064,
+      "rewards/code_format_reward/std": 0.13258016109466553,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.387128084897995,
+      "step": 58
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1461.0,
+      "completions/max_terminated_length": 1461.0,
+      "completions/mean_length": 352.4129638671875,
+      "completions/mean_terminated_length": 352.4129638671875,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "epoch": 0.24342444559051057,
+      "grad_norm": 0.2399449199438095,
+      "kl": 0.01129150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0103,
+      "num_tokens": 26054007.0,
+      "reward": 1.1071429252624512,
+      "reward_std": 0.19277828931808472,
+      "rewards/code_format_reward/mean": 0.9799107313156128,
+      "rewards/code_format_reward/std": 0.14046260714530945,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957,
+      "step": 59
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1003.0,
+      "completions/max_terminated_length": 1003.0,
+      "completions/mean_length": 339.77679443359375,
+      "completions/mean_terminated_length": 339.77679443359375,
+      "completions/min_length": 97.0,
+      "completions/min_terminated_length": 97.0,
+      "epoch": 0.2475502836513667,
+      "grad_norm": 0.23095594346523285,
+      "kl": 0.01131439208984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0026,
+      "num_tokens": 26487718.0,
+      "reward": 1.133928656578064,
+      "reward_std": 0.1849469542503357,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361,
+      "step": 60
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1105.0,
+      "completions/max_terminated_length": 1105.0,
+      "completions/mean_length": 339.27679443359375,
+      "completions/mean_terminated_length": 339.27679443359375,
+      "completions/min_length": 134.0,
+      "completions/min_terminated_length": 134.0,
+      "epoch": 0.2516761217122228,
+      "grad_norm": 0.25108498334884644,
+      "kl": 0.0118255615234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0117,
+      "num_tokens": 26899939.0,
+      "reward": 1.1540179252624512,
+      "reward_std": 0.22935986518859863,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3676777780056,
+      "step": 61
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1374.0,
+      "completions/max_terminated_length": 1374.0,
+      "completions/mean_length": 358.0692138671875,
+      "completions/mean_terminated_length": 358.0692138671875,
+      "completions/min_length": 112.0,
+      "completions/min_terminated_length": 112.0,
+      "epoch": 0.2558019597730789,
+      "grad_norm": 0.25201165676116943,
+      "kl": 0.01178741455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0151,
+      "num_tokens": 27344141.0,
+      "reward": 1.1138393878936768,
+      "reward_std": 0.2197514921426773,
+      "rewards/code_format_reward/mean": 0.984375,
+      "rewards/code_format_reward/std": 0.12415824085474014,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1294642835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3360883891582489,
+      "step": 62
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1323.0,
+      "completions/max_terminated_length": 1323.0,
+      "completions/mean_length": 343.32366943359375,
+      "completions/mean_terminated_length": 343.32366943359375,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "epoch": 0.259927797833935,
+      "grad_norm": 0.28083741664886475,
+      "kl": 0.01128387451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0269,
+      "num_tokens": 27760464.0,
+      "reward": 1.1741071939468384,
+      "reward_std": 0.25979822874069214,
+      "rewards/code_format_reward/mean": 0.9866071343421936,
+      "rewards/code_format_reward/std": 0.11507843434810638,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313,
+      "step": 63
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2247.0,
+      "completions/max_terminated_length": 2247.0,
+      "completions/mean_length": 379.7344055175781,
+      "completions/mean_terminated_length": 379.7344055175781,
+      "completions/min_length": 102.0,
+      "completions/min_terminated_length": 102.0,
+      "epoch": 0.26405363589479114,
+      "grad_norm": 0.23068185150623322,
+      "kl": 0.01107025146484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0123,
+      "num_tokens": 28210466.0,
+      "reward": 1.1049107313156128,
+      "reward_std": 0.161203071475029,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1116071417927742,
+      "rewards/curriculum_aware_reward_fn/std": 0.315234512090683,
+      "step": 64
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2890.0,
+      "completions/max_terminated_length": 2890.0,
+      "completions/mean_length": 363.1094055175781,
+      "completions/mean_terminated_length": 363.1094055175781,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "epoch": 0.26817947395564723,
+      "grad_norm": 0.24125061929225922,
+      "kl": 0.01074981689453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0108,
+      "num_tokens": 28642520.0,
+      "reward": 1.09375,
+      "reward_std": 0.1778312474489212,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.09375,
+      "rewards/curriculum_aware_reward_fn/std": 0.2918064594268799,
+      "step": 65
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1161.0,
+      "completions/max_terminated_length": 1161.0,
+      "completions/mean_length": 346.2187805175781,
+      "completions/mean_terminated_length": 346.2187805175781,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 0.2723053120165034,
+      "grad_norm": 0.2731747329235077,
+      "kl": 0.010772705078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0145,
+      "num_tokens": 29067665.0,
+      "reward": 1.171875,
+      "reward_std": 0.23607081174850464,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3815346360206604,
+      "step": 66
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 828.0,
+      "completions/max_terminated_length": 828.0,
+      "completions/mean_length": 331.7723388671875,
+      "completions/mean_terminated_length": 331.7723388671875,
+      "completions/min_length": 112.0,
+      "completions/min_terminated_length": 112.0,
+      "epoch": 0.27643115007735947,
+      "grad_norm": 0.255914568901062,
+      "kl": 0.0155487060546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0129,
+      "num_tokens": 29473388.0,
+      "reward": 1.1160714626312256,
+      "reward_std": 0.16547304391860962,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996,
+      "step": 67
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1969.0,
+      "completions/max_terminated_length": 1969.0,
+      "completions/mean_length": 344.6473388671875,
+      "completions/mean_terminated_length": 344.6473388671875,
+      "completions/min_length": 118.0,
+      "completions/min_terminated_length": 118.0,
+      "epoch": 0.28055698813821556,
+      "grad_norm": 0.23872235417366028,
+      "kl": 0.0127410888671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0105,
+      "num_tokens": 29892068.0,
+      "reward": 1.1629464626312256,
+      "reward_std": 0.18316583335399628,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.37175679206848145,
+      "step": 68
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1392.0,
+      "completions/max_terminated_length": 1392.0,
+      "completions/mean_length": 345.1629638671875,
+      "completions/mean_terminated_length": 345.1629638671875,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "epoch": 0.2846828261990717,
+      "grad_norm": 0.23450906574726105,
+      "kl": 0.01248931884765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 30304693.0,
+      "reward": 1.1160714626312256,
+      "reward_std": 0.20246198773384094,
+      "rewards/code_format_reward/mean": 0.9888392686843872,
+      "rewards/code_format_reward/std": 0.10517053306102753,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957,
+      "step": 69
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 942.0,
+      "completions/max_terminated_length": 942.0,
+      "completions/mean_length": 351.6607360839844,
+      "completions/mean_terminated_length": 351.6607360839844,
+      "completions/min_length": 112.0,
+      "completions/min_terminated_length": 112.0,
+      "epoch": 0.2888086642599278,
+      "grad_norm": 0.2397337406873703,
+      "kl": 0.01116943359375,
+      "learning_rate": 1e-06,
+      "loss": 0.003,
+      "num_tokens": 30717856.0,
+      "reward": 1.1116071939468384,
+      "reward_std": 0.19447438418865204,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1183035746216774,
+      "rewards/curriculum_aware_reward_fn/std": 0.32332828640937805,
+      "step": 70
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1005.0,
+      "completions/max_terminated_length": 1005.0,
+      "completions/mean_length": 340.13616943359375,
+      "completions/mean_terminated_length": 340.13616943359375,
+      "completions/min_length": 100.0,
+      "completions/min_terminated_length": 100.0,
+      "epoch": 0.2929345023207839,
+      "grad_norm": 0.267532080411911,
+      "kl": 0.0149688720703125,
+      "learning_rate": 1e-06,
+      "loss": -0.0061,
+      "num_tokens": 31143909.0,
+      "reward": 1.1696429252624512,
+      "reward_std": 0.22853097319602966,
+      "rewards/code_format_reward/mean": 0.9866071343421936,
+      "rewards/code_format_reward/std": 0.11507843434810638,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.387128084897995,
+      "step": 71
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1265.0,
+      "completions/max_terminated_length": 1265.0,
+      "completions/mean_length": 346.5870666503906,
+      "completions/mean_terminated_length": 346.5870666503906,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "epoch": 0.29706034038164003,
+      "grad_norm": 0.29796159267425537,
+      "kl": 0.01158905029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "num_tokens": 31579635.0,
+      "reward": 1.1741071939468384,
+      "reward_std": 0.2576758563518524,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.387128084897995,
+      "step": 72
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1037.0,
+      "completions/max_terminated_length": 1037.0,
+      "completions/mean_length": 344.6763610839844,
+      "completions/mean_terminated_length": 344.6763610839844,
+      "completions/min_length": 106.0,
+      "completions/min_terminated_length": 106.0,
+      "epoch": 0.3011861784424961,
+      "grad_norm": 0.2371477484703064,
+      "kl": 0.01210784912109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0122,
+      "num_tokens": 31999301.0,
+      "reward": 1.140625,
+      "reward_std": 0.18199601769447327,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.35664716362953186,
+      "step": 73
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3149.0,
+      "completions/max_terminated_length": 3149.0,
+      "completions/mean_length": 387.5067138671875,
+      "completions/mean_terminated_length": 387.5067138671875,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 0.30531201650335227,
+      "grad_norm": 0.20645320415496826,
+      "kl": 0.0110015869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0236,
+      "num_tokens": 32465707.0,
+      "reward": 1.1294643878936768,
+      "reward_std": 0.17819245159626007,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133,
+      "step": 74
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1343.0,
+      "completions/max_terminated_length": 1343.0,
+      "completions/mean_length": 362.12725830078125,
+      "completions/mean_terminated_length": 362.12725830078125,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "epoch": 0.30943785456420836,
+      "grad_norm": 0.23689766228199005,
+      "kl": 0.0115509033203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0074,
+      "num_tokens": 32894283.0,
+      "reward": 1.1116071939468384,
+      "reward_std": 0.1989629715681076,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176,
+      "step": 75
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1811.0,
+      "completions/max_terminated_length": 1811.0,
+      "completions/mean_length": 378.2901916503906,
+      "completions/mean_terminated_length": 378.2901916503906,
+      "completions/min_length": 133.0,
+      "completions/min_terminated_length": 133.0,
+      "epoch": 0.31356369262506445,
+      "grad_norm": 0.2228821963071823,
+      "kl": 0.0117340087890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0148,
+      "num_tokens": 33348734.0,
+      "reward": 1.1383929252624512,
+      "reward_std": 0.1881648153066635,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.3480229377746582,
+      "step": 76
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2446.0,
+      "completions/max_terminated_length": 2446.0,
+      "completions/mean_length": 400.8750305175781,
+      "completions/mean_terminated_length": 400.8750305175781,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.3176895306859206,
+      "grad_norm": 0.2140088975429535,
+      "kl": 0.0146026611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0284,
+      "num_tokens": 33807640.0,
+      "reward": 1.1674107313156128,
+      "reward_std": 0.18470536172389984,
+      "rewards/code_format_reward/mean": 0.9888392686843872,
+      "rewards/code_format_reward/std": 0.10517053306102753,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.42229342460632324,
+      "step": 77
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1205.0,
+      "completions/max_terminated_length": 1205.0,
+      "completions/mean_length": 372.58929443359375,
+      "completions/mean_terminated_length": 372.58929443359375,
+      "completions/min_length": 94.0,
+      "completions/min_terminated_length": 94.0,
+      "epoch": 0.3218153687467767,
+      "grad_norm": 0.24875551462173462,
+      "kl": 0.01198577880859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0117,
+      "num_tokens": 34250108.0,
+      "reward": 1.165178656578064,
+      "reward_std": 0.20772093534469604,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276,
+      "step": 78
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1497.0,
+      "completions/mean_length": 375.45538330078125,
+      "completions/mean_terminated_length": 367.1319885253906,
+      "completions/min_length": 114.0,
+      "completions/min_terminated_length": 114.0,
+      "epoch": 0.3259412068076328,
+      "grad_norm": 0.24669714272022247,
+      "kl": 0.01181793212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0246,
+      "num_tokens": 34689597.0,
+      "reward": 1.133928656578064,
+      "reward_std": 0.17734427750110626,
+      "rewards/code_format_reward/mean": 0.9888392686843872,
+      "rewards/code_format_reward/std": 0.10517053306102753,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.352584570646286,
+      "step": 79
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1325.0,
+      "completions/max_terminated_length": 1325.0,
+      "completions/mean_length": 345.0714416503906,
+      "completions/mean_terminated_length": 345.0714416503906,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.3300670448684889,
+      "grad_norm": 0.221415176987648,
+      "kl": 0.012115478515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0223,
+      "num_tokens": 35106479.0,
+      "reward": 1.1473214626312256,
+      "reward_std": 0.1726084053516388,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.3570319712162018,
+      "step": 80
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1138.0,
+      "completions/max_terminated_length": 1138.0,
+      "completions/mean_length": 360.6227722167969,
+      "completions/mean_terminated_length": 360.6227722167969,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "epoch": 0.334192882929345,
+      "grad_norm": 0.21861205995082855,
+      "kl": 0.01169586181640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0136,
+      "num_tokens": 35539442.0,
+      "reward": 1.1897321939468384,
+      "reward_std": 0.20504766702651978,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.4253509044647217,
+      "step": 81
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3323.0,
+      "completions/max_terminated_length": 3323.0,
+      "completions/mean_length": 371.7656555175781,
+      "completions/mean_terminated_length": 371.7656555175781,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "epoch": 0.33831872099020116,
+      "grad_norm": 0.24164587259292603,
+      "kl": 0.0117950439453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0235,
+      "num_tokens": 35974317.0,
+      "reward": 1.165178656578064,
+      "reward_std": 0.2279203236103058,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276,
+      "step": 82
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1651.0,
+      "completions/max_terminated_length": 1651.0,
+      "completions/mean_length": 372.0469055175781,
+      "completions/mean_terminated_length": 372.0469055175781,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "epoch": 0.34244455905105725,
+      "grad_norm": 0.27285560965538025,
+      "kl": 0.0119781494140625,
+      "learning_rate": 1e-06,
+      "loss": -0.0017,
+      "num_tokens": 36432978.0,
+      "reward": 1.1919643878936768,
+      "reward_std": 0.24956144392490387,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897,
+      "step": 83
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1329.0,
+      "completions/max_terminated_length": 1329.0,
+      "completions/mean_length": 395.25225830078125,
+      "completions/mean_terminated_length": 395.25225830078125,
+      "completions/min_length": 118.0,
+      "completions/min_terminated_length": 118.0,
+      "epoch": 0.34657039711191334,
+      "grad_norm": 0.220729798078537,
+      "kl": 0.01096343994140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0066,
+      "num_tokens": 36892984.0,
+      "reward": 1.1205358505249023,
+      "reward_std": 0.18371452391147614,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176,
+      "step": 84
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3794.0,
+      "completions/max_terminated_length": 3794.0,
+      "completions/mean_length": 372.86163330078125,
+      "completions/mean_terminated_length": 372.86163330078125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 0.3506962351727695,
+      "grad_norm": 0.2561819851398468,
+      "kl": 0.0115966796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0127,
+      "num_tokens": 37340754.0,
+      "reward": 1.0915179252624512,
+      "reward_std": 0.2180769443511963,
+      "rewards/code_format_reward/mean": 0.984375,
+      "rewards/code_format_reward/std": 0.12415824085474014,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1071428582072258,
+      "rewards/curriculum_aware_reward_fn/std": 0.3096405565738678,
+      "step": 85
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1030.0,
+      "completions/mean_length": 340.00225830078125,
+      "completions/mean_terminated_length": 331.59954833984375,
+      "completions/min_length": 101.0,
+      "completions/min_terminated_length": 101.0,
+      "epoch": 0.3548220732336256,
+      "grad_norm": 0.25818702578544617,
+      "kl": 0.012969970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.044,
+      "num_tokens": 37750619.0,
+      "reward": 1.15625,
+      "reward_std": 0.2230120152235031,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194,
+      "step": 86
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 990.0,
+      "completions/max_terminated_length": 990.0,
+      "completions/mean_length": 349.18975830078125,
+      "completions/mean_terminated_length": 349.18975830078125,
+      "completions/min_length": 79.0,
+      "completions/min_terminated_length": 79.0,
+      "epoch": 0.35894791129448167,
+      "grad_norm": 0.21370843052864075,
+      "kl": 0.0142059326171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0059,
+      "num_tokens": 38191410.0,
+      "reward": 1.140625,
+      "reward_std": 0.16670270264148712,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361,
+      "step": 87
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1830.0,
+      "completions/max_terminated_length": 1830.0,
+      "completions/mean_length": 355.7187805175781,
+      "completions/mean_terminated_length": 355.7187805175781,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 0.3630737493553378,
+      "grad_norm": 0.243008553981781,
+      "kl": 0.0123291015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0034,
+      "num_tokens": 38626488.0,
+      "reward": 1.1383929252624512,
+      "reward_std": 0.19419102370738983,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.352584570646286,
+      "step": 88
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1263.0,
+      "completions/max_terminated_length": 1263.0,
+      "completions/mean_length": 361.5245666503906,
+      "completions/mean_terminated_length": 361.5245666503906,
+      "completions/min_length": 107.0,
+      "completions/min_terminated_length": 107.0,
+      "epoch": 0.3671995874161939,
+      "grad_norm": 0.23066678643226624,
+      "kl": 0.0127716064453125,
+      "learning_rate": 1e-06,
+      "loss": -0.0011,
+      "num_tokens": 39070327.0,
+      "reward": 1.1361607313156128,
+      "reward_std": 0.19861890375614166,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361,
+      "step": 89
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1316.0,
+      "completions/max_terminated_length": 1316.0,
+      "completions/mean_length": 344.4508972167969,
+      "completions/mean_terminated_length": 344.4508972167969,
+      "completions/min_length": 109.0,
+      "completions/min_terminated_length": 109.0,
+      "epoch": 0.37132542547705005,
+      "grad_norm": 0.22873297333717346,
+      "kl": 0.0172271728515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0152,
+      "num_tokens": 39504172.0,
+      "reward": 1.118303656578064,
+      "reward_std": 0.18025504052639008,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1227678582072258,
+      "rewards/curriculum_aware_reward_fn/std": 0.3285374343395233,
+      "step": 90
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1097.0,
+      "completions/max_terminated_length": 1097.0,
+      "completions/mean_length": 337.7276916503906,
+      "completions/mean_terminated_length": 337.7276916503906,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 0.37545126353790614,
+      "grad_norm": 0.2452411651611328,
+      "kl": 0.01430511474609375,
+      "learning_rate": 1e-06,
+      "loss": 0.013,
+      "num_tokens": 39933654.0,
+      "reward": 1.1316964626312256,
+      "reward_std": 0.1899057924747467,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.34095630049705505,
+      "step": 91
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1061.0,
+      "completions/max_terminated_length": 1061.0,
+      "completions/mean_length": 333.578125,
+      "completions/mean_terminated_length": 333.578125,
+      "completions/min_length": 121.0,
+      "completions/min_terminated_length": 121.0,
+      "epoch": 0.37957710159876223,
+      "grad_norm": 0.2378436028957367,
+      "kl": 0.01314544677734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0164,
+      "num_tokens": 40341761.0,
+      "reward": 1.15625,
+      "reward_std": 0.2169431895017624,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.37560540437698364,
+      "step": 92
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 844.0,
+      "completions/max_terminated_length": 844.0,
+      "completions/mean_length": 324.9598388671875,
+      "completions/mean_terminated_length": 324.9598388671875,
+      "completions/min_length": 77.0,
+      "completions/min_terminated_length": 77.0,
+      "epoch": 0.3837029396596184,
+      "grad_norm": 0.2310194969177246,
+      "kl": 0.01369476318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0213,
+      "num_tokens": 40760812.0,
+      "reward": 1.149553656578064,
+      "reward_std": 0.198617085814476,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.3873344361782074,
+      "step": 93
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1514.0,
+      "completions/max_terminated_length": 1514.0,
+      "completions/mean_length": 342.8951110839844,
+      "completions/mean_terminated_length": 342.8951110839844,
+      "completions/min_length": 90.0,
+      "completions/min_terminated_length": 90.0,
+      "epoch": 0.38782877772047447,
+      "grad_norm": 0.2386288344860077,
+      "kl": 0.013671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0181,
+      "num_tokens": 41179939.0,
+      "reward": 1.15625,
+      "reward_std": 0.2045021653175354,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194,
+      "step": 94
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 960.0,
+      "completions/max_terminated_length": 960.0,
+      "completions/mean_length": 331.078125,
+      "completions/mean_terminated_length": 331.078125,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "epoch": 0.39195461578133056,
+      "grad_norm": 0.25872641801834106,
+      "kl": 0.01519775390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0124,
+      "num_tokens": 41603992.0,
+      "reward": 1.1763393878936768,
+      "reward_std": 0.20531079173088074,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3834211826324463,
+      "step": 95
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3766.0,
+      "completions/max_terminated_length": 3766.0,
+      "completions/mean_length": 349.6160888671875,
+      "completions/mean_terminated_length": 349.6160888671875,
+      "completions/min_length": 80.0,
+      "completions/min_terminated_length": 80.0,
+      "epoch": 0.3960804538421867,
+      "grad_norm": 0.22162525355815887,
+      "kl": 0.01556396484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0156,
+      "num_tokens": 42031849.0,
+      "reward": 1.180803656578064,
+      "reward_std": 0.1656801700592041,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.39252743124961853,
+      "step": 96
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 882.0,
+      "completions/max_terminated_length": 882.0,
+      "completions/mean_length": 337.1763610839844,
+      "completions/mean_terminated_length": 337.1763610839844,
+      "completions/min_length": 90.0,
+      "completions/min_terminated_length": 90.0,
+      "epoch": 0.4002062919030428,
+      "grad_norm": 0.26754871010780334,
+      "kl": 0.01500701904296875,
+      "learning_rate": 1e-06,
+      "loss": -0.0018,
+      "num_tokens": 42448883.0,
+      "reward": 1.2321429252624512,
+      "reward_std": 0.2509976029396057,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2388392835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.4522976279258728,
+      "step": 97
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1271.0,
+      "completions/max_terminated_length": 1271.0,
+      "completions/mean_length": 341.90850830078125,
+      "completions/mean_terminated_length": 341.90850830078125,
+      "completions/min_length": 71.0,
+      "completions/min_terminated_length": 71.0,
+      "epoch": 0.4043321299638989,
+      "grad_norm": 0.26659685373306274,
+      "kl": 0.0152587890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0018,
+      "num_tokens": 42877732.0,
+      "reward": 1.2008929252624512,
+      "reward_std": 0.23461629450321198,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2098214328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.42378073930740356,
+      "step": 98
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 945.0,
+      "completions/max_terminated_length": 945.0,
+      "completions/mean_length": 319.0558166503906,
+      "completions/mean_terminated_length": 319.0558166503906,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "epoch": 0.40845796802475504,
+      "grad_norm": 0.24595794081687927,
+      "kl": 0.0162200927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0081,
+      "num_tokens": 43290883.0,
+      "reward": 1.1875,
+      "reward_std": 0.20630162954330444,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313,
+      "step": 99
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1589.0,
+      "completions/mean_length": 328.95538330078125,
+      "completions/mean_terminated_length": 320.5279541015625,
+      "completions/min_length": 98.0,
+      "completions/min_terminated_length": 98.0,
+      "epoch": 0.4125838060856111,
+      "grad_norm": 0.21619565784931183,
+      "kl": 0.015472412109375,
+      "learning_rate": 1e-06,
+      "loss": 0.027,
+      "num_tokens": 43702890.0,
+      "reward": 1.15625,
+      "reward_std": 0.15517687797546387,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3676777780056,
+      "step": 100
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 938.0,
+      "completions/max_terminated_length": 938.0,
+      "completions/mean_length": 339.11163330078125,
+      "completions/mean_terminated_length": 339.11163330078125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 0.4167096441464673,
+      "grad_norm": 0.21944324672222137,
+      "kl": 0.014801025390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0124,
+      "num_tokens": 44137822.0,
+      "reward": 1.1361607313156128,
+      "reward_std": 0.195481076836586,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1361607164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.35613569617271423,
+      "step": 101
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1717.0,
+      "completions/max_terminated_length": 1717.0,
+      "completions/mean_length": 325.9352722167969,
+      "completions/mean_terminated_length": 325.9352722167969,
+      "completions/min_length": 101.0,
+      "completions/min_terminated_length": 101.0,
+      "epoch": 0.42083548220732336,
+      "grad_norm": 0.24191616475582123,
+      "kl": 0.0155029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0029,
+      "num_tokens": 44569189.0,
+      "reward": 1.1875,
+      "reward_std": 0.1999538093805313,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313,
+      "step": 102
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2173.0,
+      "completions/max_terminated_length": 2173.0,
+      "completions/mean_length": 332.6317138671875,
+      "completions/mean_terminated_length": 332.6317138671875,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "epoch": 0.42496132026817945,
+      "grad_norm": 0.24171848595142365,
+      "kl": 0.0145721435546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0029,
+      "num_tokens": 44998642.0,
+      "reward": 1.1294643878936768,
+      "reward_std": 0.2073148936033249,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1339285671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.34095630049705505,
+      "step": 103
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1289.0,
+      "completions/max_terminated_length": 1289.0,
+      "completions/mean_length": 328.2901916503906,
+      "completions/mean_terminated_length": 328.2901916503906,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 0.4290871583290356,
+      "grad_norm": 0.2536793649196625,
+      "kl": 0.01633453369140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 45401699.0,
+      "reward": 1.102678656578064,
+      "reward_std": 0.19625361263751984,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1049107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3067808747291565,
+      "step": 104
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 911.0,
+      "completions/max_terminated_length": 911.0,
+      "completions/mean_length": 293.4933166503906,
+      "completions/mean_terminated_length": 293.4933166503906,
+      "completions/min_length": 84.0,
+      "completions/min_terminated_length": 84.0,
+      "epoch": 0.4332129963898917,
+      "grad_norm": 0.26053330302238464,
+      "kl": 0.017364501953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0221,
+      "num_tokens": 45793262.0,
+      "reward": 1.227678656578064,
+      "reward_std": 0.2376893311738968,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2299107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.42124560475349426,
+      "step": 105
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1037.0,
+      "completions/max_terminated_length": 1037.0,
+      "completions/mean_length": 305.71429443359375,
+      "completions/mean_terminated_length": 305.71429443359375,
+      "completions/min_length": 87.0,
+      "completions/min_terminated_length": 87.0,
+      "epoch": 0.4373388344507478,
+      "grad_norm": 0.22793447971343994,
+      "kl": 0.0171966552734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "num_tokens": 46204258.0,
+      "reward": 1.1383929252624512,
+      "reward_std": 0.16399335861206055,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1383928507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.34569787979125977,
+      "step": 106
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 977.0,
+      "completions/max_terminated_length": 977.0,
+      "completions/mean_length": 309.25225830078125,
+      "completions/mean_terminated_length": 309.25225830078125,
+      "completions/min_length": 106.0,
+      "completions/min_terminated_length": 106.0,
+      "epoch": 0.44146467251160393,
+      "grad_norm": 0.6037999391555786,
+      "kl": 0.0637969970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0079,
+      "num_tokens": 46595608.0,
+      "reward": 1.212053656578064,
+      "reward_std": 0.2621622085571289,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.41233164072036743,
+      "step": 107
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 878.0,
+      "completions/max_terminated_length": 878.0,
+      "completions/mean_length": 295.12725830078125,
+      "completions/mean_terminated_length": 295.12725830078125,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "epoch": 0.44559051057246,
+      "grad_norm": 0.2791634202003479,
+      "kl": 0.01679229736328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0067,
+      "num_tokens": 46995953.0,
+      "reward": 1.1897321939468384,
+      "reward_std": 0.25247541069984436,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161,
+      "step": 108
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1053.0,
+      "completions/max_terminated_length": 1053.0,
+      "completions/mean_length": 300.75225830078125,
+      "completions/mean_terminated_length": 300.75225830078125,
+      "completions/min_length": 98.0,
+      "completions/min_terminated_length": 98.0,
+      "epoch": 0.44971634863331617,
+      "grad_norm": 0.259223073720932,
+      "kl": 0.01723480224609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0161,
+      "num_tokens": 47398069.0,
+      "reward": 1.1986607313156128,
+      "reward_std": 0.18366967141628265,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1986607164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3994380831718445,
+      "step": 109
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 839.0,
+      "completions/max_terminated_length": 839.0,
+      "completions/mean_length": 306.62725830078125,
+      "completions/mean_terminated_length": 306.62725830078125,
+      "completions/min_length": 87.0,
+      "completions/min_terminated_length": 87.0,
+      "epoch": 0.45384218669417226,
+      "grad_norm": 0.26962050795555115,
+      "kl": 0.01969146728515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0116,
+      "num_tokens": 47807458.0,
+      "reward": 1.1540179252624512,
+      "reward_std": 0.19651676714420319,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194,
+      "step": 110
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 783.0,
+      "completions/max_terminated_length": 783.0,
+      "completions/mean_length": 281.5201110839844,
+      "completions/mean_terminated_length": 281.5201110839844,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "epoch": 0.45796802475502835,
+      "grad_norm": 0.288915753364563,
+      "kl": 0.0187835693359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0097,
+      "num_tokens": 48179027.0,
+      "reward": 1.2678571939468384,
+      "reward_std": 0.230044886469841,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2723214328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.4456520676612854,
+      "step": 111
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1853.0,
+      "completions/max_terminated_length": 1853.0,
+      "completions/mean_length": 309.90850830078125,
+      "completions/mean_terminated_length": 309.90850830078125,
+      "completions/min_length": 90.0,
+      "completions/min_terminated_length": 90.0,
+      "epoch": 0.4620938628158845,
+      "grad_norm": 0.3339619040489197,
+      "kl": 0.03505706787109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0112,
+      "num_tokens": 48599777.0,
+      "reward": 1.1428571939468384,
+      "reward_std": 0.17816074192523956,
+      "rewards/code_format_reward/mean": 0.9888392686843872,
+      "rewards/code_format_reward/std": 0.10517053306102753,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.36136940121650696,
+      "step": 112
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 984.0,
+      "completions/max_terminated_length": 984.0,
+      "completions/mean_length": 302.5915222167969,
+      "completions/mean_terminated_length": 302.5915222167969,
+      "completions/min_length": 72.0,
+      "completions/min_terminated_length": 72.0,
+      "epoch": 0.4662197008767406,
+      "grad_norm": 0.2208058387041092,
+      "kl": 0.0186004638671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0224,
+      "num_tokens": 49020525.0,
+      "reward": 1.1540179252624512,
+      "reward_std": 0.16597501933574677,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.36136937141418457,
+      "step": 113
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 920.0,
+      "completions/max_terminated_length": 920.0,
+      "completions/mean_length": 291.4821472167969,
+      "completions/mean_terminated_length": 291.4821472167969,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "epoch": 0.4703455389375967,
+      "grad_norm": 0.3014701306819916,
+      "kl": 0.021331787109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0208,
+      "num_tokens": 49403298.0,
+      "reward": 1.149553656578064,
+      "reward_std": 0.2552274167537689,
+      "rewards/code_format_reward/mean": 0.9888392686843872,
+      "rewards/code_format_reward/std": 0.10517053306102753,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3676777780056,
+      "step": 114
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1006.0,
+      "completions/max_terminated_length": 1006.0,
+      "completions/mean_length": 295.80804443359375,
+      "completions/mean_terminated_length": 295.80804443359375,
+      "completions/min_length": 86.0,
+      "completions/min_terminated_length": 86.0,
+      "epoch": 0.4744713769984528,
+      "grad_norm": 0.28405463695526123,
+      "kl": 0.020538330078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0215,
+      "num_tokens": 49797277.0,
+      "reward": 1.1540180444717407,
+      "reward_std": 0.25069838762283325,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3737127482891083,
+      "step": 115
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 957.0,
+      "completions/mean_length": 320.63616943359375,
+      "completions/mean_terminated_length": 312.1901550292969,
+      "completions/min_length": 102.0,
+      "completions/min_terminated_length": 102.0,
+      "epoch": 0.4785972150593089,
+      "grad_norm": 0.24680057168006897,
+      "kl": 0.016876220703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0261,
+      "num_tokens": 50206376.0,
+      "reward": 1.1674107313156128,
+      "reward_std": 0.22103038430213928,
+      "rewards/code_format_reward/mean": 0.984375,
+      "rewards/code_format_reward/std": 0.12415824085474014,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.387128084897995,
+      "step": 116
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1170.0,
+      "completions/max_terminated_length": 1170.0,
+      "completions/mean_length": 312.59375,
+      "completions/mean_terminated_length": 312.59375,
+      "completions/min_length": 100.0,
+      "completions/min_terminated_length": 100.0,
+      "epoch": 0.48272305312016506,
+      "grad_norm": 0.278852641582489,
+      "kl": 0.01776885986328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0177,
+      "num_tokens": 50625368.0,
+      "reward": 1.15625,
+      "reward_std": 0.21217124164104462,
+      "rewards/code_format_reward/mean": 0.9888392686843872,
+      "rewards/code_format_reward/std": 0.10517053306102753,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457,
+      "step": 117
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 977.0,
+      "completions/max_terminated_length": 977.0,
+      "completions/mean_length": 288.9598388671875,
+      "completions/mean_terminated_length": 288.9598388671875,
+      "completions/min_length": 83.0,
+      "completions/min_terminated_length": 83.0,
+      "epoch": 0.48684889118102115,
+      "grad_norm": 0.29333117604255676,
+      "kl": 0.01859283447265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0121,
+      "num_tokens": 51026044.0,
+      "reward": 1.1517857313156128,
+      "reward_std": 0.23571307957172394,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3676777780056,
+      "step": 118
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1755.0,
+      "completions/max_terminated_length": 1755.0,
+      "completions/mean_length": 304.8258972167969,
+      "completions/mean_terminated_length": 304.8258972167969,
+      "completions/min_length": 98.0,
+      "completions/min_terminated_length": 98.0,
+      "epoch": 0.49097472924187724,
+      "grad_norm": 0.23791854083538055,
+      "kl": 0.017547607421875,
+      "learning_rate": 1e-06,
+      "loss": 0.0042,
+      "num_tokens": 51435060.0,
+      "reward": 1.149553656578064,
+      "reward_std": 0.1614886224269867,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.40789952874183655,
+      "step": 119
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1238.0,
+      "completions/max_terminated_length": 1238.0,
+      "completions/mean_length": 309.35491943359375,
+      "completions/mean_terminated_length": 309.35491943359375,
+      "completions/min_length": 78.0,
+      "completions/min_terminated_length": 78.0,
+      "epoch": 0.4951005673027334,
+      "grad_norm": 0.32031339406967163,
+      "kl": 0.02161407470703125,
+      "learning_rate": 1e-06,
+      "loss": 0.011,
+      "num_tokens": 51836242.0,
+      "reward": 1.180803656578064,
+      "reward_std": 0.3114909827709198,
+      "rewards/code_format_reward/mean": 0.9776785969734192,
+      "rewards/code_format_reward/std": 0.1478918492794037,
+      "rewards/curriculum_aware_reward_fn/mean": 0.203125,
+      "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365,
+      "step": 120
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.004464285714285698,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 891.0,
+      "completions/mean_length": 310.6138610839844,
+      "completions/mean_terminated_length": 293.6390380859375,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "epoch": 0.4992264053635895,
+      "grad_norm": 0.26912397146224976,
+      "kl": 0.017120361328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0528,
+      "num_tokens": 52242545.0,
+      "reward": 1.1361608505249023,
+      "reward_std": 0.26077795028686523,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1616371124982834,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928,
+      "step": 121
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1507.0,
+      "completions/max_terminated_length": 1507.0,
+      "completions/mean_length": 304.5602722167969,
+      "completions/mean_terminated_length": 304.5602722167969,
+      "completions/min_length": 76.0,
+      "completions/min_terminated_length": 76.0,
+      "epoch": 0.5033522434244456,
+      "grad_norm": 0.28717559576034546,
+      "kl": 0.016204833984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0229,
+      "num_tokens": 52644883.0,
+      "reward": 1.2455357313156128,
+      "reward_std": 0.29382893443107605,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25,
+      "rewards/curriculum_aware_reward_fn/std": 0.4823506772518158,
+      "step": 122
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1113.0,
+      "completions/max_terminated_length": 1113.0,
+      "completions/mean_length": 324.8370666503906,
+      "completions/mean_terminated_length": 324.8370666503906,
+      "completions/min_length": 112.0,
+      "completions/min_terminated_length": 112.0,
+      "epoch": 0.5074780814853017,
+      "grad_norm": 0.28405648469924927,
+      "kl": 0.01723480224609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0218,
+      "num_tokens": 53072097.0,
+      "reward": 1.1540179252624512,
+      "reward_std": 0.2212349772453308,
+      "rewards/code_format_reward/mean": 0.9866071343421936,
+      "rewards/code_format_reward/std": 0.11507844179868698,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457,
+      "step": 123
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 726.0,
+      "completions/max_terminated_length": 726.0,
+      "completions/mean_length": 294.9910888671875,
+      "completions/mean_terminated_length": 294.9910888671875,
+      "completions/min_length": 78.0,
+      "completions/min_terminated_length": 78.0,
+      "epoch": 0.5116039195461578,
+      "grad_norm": 0.25819990038871765,
+      "kl": 0.0186614990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0036,
+      "num_tokens": 53460320.0,
+      "reward": 1.149553656578064,
+      "reward_std": 0.20559635758399963,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184,
+      "step": 124
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1141.0,
+      "completions/max_terminated_length": 1141.0,
+      "completions/mean_length": 307.7723388671875,
+      "completions/mean_terminated_length": 307.7723388671875,
+      "completions/min_length": 98.0,
+      "completions/min_terminated_length": 98.0,
+      "epoch": 0.5157297576070139,
+      "grad_norm": 0.29429891705513,
+      "kl": 0.0176544189453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0331,
+      "num_tokens": 53860688.0,
+      "reward": 1.196428656578064,
+      "reward_std": 0.22343392670154572,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.203125,
+      "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365,
+      "step": 125
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 884.0,
+      "completions/max_terminated_length": 884.0,
+      "completions/mean_length": 293.2321472167969,
+      "completions/mean_terminated_length": 293.2321472167969,
+      "completions/min_length": 59.0,
+      "completions/min_terminated_length": 59.0,
+      "epoch": 0.51985559566787,
+      "grad_norm": 0.30082517862319946,
+      "kl": 0.0195159912109375,
+      "learning_rate": 1e-06,
+      "loss": -0.0047,
+      "num_tokens": 54251469.0,
+      "reward": 1.212053656578064,
+      "reward_std": 0.21257728338241577,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2209821492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.4153723120689392,
+      "step": 126
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2435.0,
+      "completions/max_terminated_length": 2435.0,
+      "completions/mean_length": 318.7946472167969,
+      "completions/mean_terminated_length": 318.7946472167969,
+      "completions/min_length": 100.0,
+      "completions/min_terminated_length": 100.0,
+      "epoch": 0.5239814337287262,
+      "grad_norm": 0.27134010195732117,
+      "kl": 0.01700592041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0026,
+      "num_tokens": 54667695.0,
+      "reward": 1.1383929252624512,
+      "reward_std": 0.2325977087020874,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.3566471338272095,
+      "step": 127
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1320.0,
+      "completions/max_terminated_length": 1320.0,
+      "completions/mean_length": 321.45538330078125,
+      "completions/mean_terminated_length": 321.45538330078125,
+      "completions/min_length": 104.0,
+      "completions/min_terminated_length": 104.0,
+      "epoch": 0.5281072717895823,
+      "grad_norm": 0.23834328353405,
+      "kl": 0.01728057861328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0117,
+      "num_tokens": 55078358.0,
+      "reward": 1.1607143878936768,
+      "reward_std": 0.15348079800605774,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928,
+      "step": 128
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1619.0,
+      "completions/max_terminated_length": 1619.0,
+      "completions/mean_length": 337.97991943359375,
+      "completions/mean_terminated_length": 337.97991943359375,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "epoch": 0.5322331098504384,
+      "grad_norm": 0.24708496034145355,
+      "kl": 0.0186614990234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0155,
+      "num_tokens": 55527742.0,
+      "reward": 1.127232313156128,
+      "reward_std": 0.2065713256597519,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133,
+      "step": 129
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.004464285714285698,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1139.0,
+      "completions/mean_length": 329.22991943359375,
+      "completions/mean_terminated_length": 312.3385925292969,
+      "completions/min_length": 91.0,
+      "completions/min_terminated_length": 91.0,
+      "epoch": 0.5363589479112945,
+      "grad_norm": 0.2480604648590088,
+      "kl": 0.020965576171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0453,
+      "num_tokens": 55941881.0,
+      "reward": 1.1696429252624512,
+      "reward_std": 0.2080267071723938,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.41156184673309326,
+      "step": 130
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1275.0,
+      "completions/max_terminated_length": 1275.0,
+      "completions/mean_length": 324.04241943359375,
+      "completions/mean_terminated_length": 324.04241943359375,
+      "completions/min_length": 85.0,
+      "completions/min_terminated_length": 85.0,
+      "epoch": 0.5404847859721505,
+      "grad_norm": 0.27031761407852173,
+      "kl": 0.0172119140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0045,
+      "num_tokens": 56365057.0,
+      "reward": 1.1607143878936768,
+      "reward_std": 0.20167233049869537,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3676777780056,
+      "step": 131
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1266.0,
+      "completions/max_terminated_length": 1266.0,
+      "completions/mean_length": 332.5133972167969,
+      "completions/mean_terminated_length": 332.5133972167969,
+      "completions/min_length": 105.0,
+      "completions/min_terminated_length": 105.0,
+      "epoch": 0.5446106240330068,
+      "grad_norm": 0.2554526627063751,
+      "kl": 0.0162353515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0179,
+      "num_tokens": 56796550.0,
+      "reward": 1.1629464626312256,
+      "reward_std": 0.19279412925243378,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928,
+      "step": 132
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 998.0,
+      "completions/max_terminated_length": 998.0,
+      "completions/mean_length": 320.7076110839844,
+      "completions/mean_terminated_length": 320.7076110839844,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "epoch": 0.5487364620938628,
+      "grad_norm": 0.27639445662498474,
+      "kl": 0.0175933837890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0137,
+      "num_tokens": 57218694.0,
+      "reward": 1.140625,
+      "reward_std": 0.2198539674282074,
+      "rewards/code_format_reward/mean": 0.9888392686843872,
+      "rewards/code_format_reward/std": 0.10517053306102753,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184,
+      "step": 133
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1035.0,
+      "completions/max_terminated_length": 1035.0,
+      "completions/mean_length": 330.8214416503906,
+      "completions/mean_terminated_length": 330.8214416503906,
+      "completions/min_length": 114.0,
+      "completions/min_terminated_length": 114.0,
+      "epoch": 0.5528623001547189,
+      "grad_norm": 0.27146339416503906,
+      "kl": 0.01773834228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0051,
+      "num_tokens": 57637410.0,
+      "reward": 1.1383929252624512,
+      "reward_std": 0.19378496706485748,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.140625,
+      "rewards/curriculum_aware_reward_fn/std": 0.3480229377746582,
+      "step": 134
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1298.0,
+      "completions/mean_length": 362.9352722167969,
+      "completions/mean_terminated_length": 354.5838928222656,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "epoch": 0.556988138215575,
+      "grad_norm": 0.2487378716468811,
+      "kl": 0.017333984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0321,
+      "num_tokens": 58083673.0,
+      "reward": 1.1696429252624512,
+      "reward_std": 0.20776358246803284,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3873537480831146,
+      "step": 135
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1828.0,
+      "completions/max_terminated_length": 1828.0,
+      "completions/mean_length": 327.7410888671875,
+      "completions/mean_terminated_length": 327.7410888671875,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "epoch": 0.5611139762764311,
+      "grad_norm": 0.27792710065841675,
+      "kl": 0.019989013671875,
+      "learning_rate": 1e-06,
+      "loss": 0.0105,
+      "num_tokens": 58492336.0,
+      "reward": 1.1919643878936768,
+      "reward_std": 0.24865475296974182,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897,
+      "step": 136
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1106.0,
+      "completions/max_terminated_length": 1106.0,
+      "completions/mean_length": 347.3214416503906,
+      "completions/mean_terminated_length": 347.3214416503906,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 0.5652398143372873,
+      "grad_norm": 0.22186486423015594,
+      "kl": 0.018646240234375,
+      "learning_rate": 1e-06,
+      "loss": 0.0126,
+      "num_tokens": 58913446.0,
+      "reward": 1.1540179252624512,
+      "reward_std": 0.1549137532711029,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.36349809169769287,
+      "step": 137
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1071.0,
+      "completions/max_terminated_length": 1071.0,
+      "completions/mean_length": 346.89288330078125,
+      "completions/mean_terminated_length": 346.89288330078125,
+      "completions/min_length": 114.0,
+      "completions/min_terminated_length": 114.0,
+      "epoch": 0.5693656523981434,
+      "grad_norm": 0.2620648443698883,
+      "kl": 0.0197601318359375,
+      "learning_rate": 1e-06,
+      "loss": -0.0008,
+      "num_tokens": 59344787.0,
+      "reward": 1.149553656578064,
+      "reward_std": 0.19237443804740906,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.36324387788772583,
+      "step": 138
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 858.0,
+      "completions/max_terminated_length": 858.0,
+      "completions/mean_length": 325.52679443359375,
+      "completions/mean_terminated_length": 325.52679443359375,
+      "completions/min_length": 112.0,
+      "completions/min_terminated_length": 112.0,
+      "epoch": 0.5734914904589995,
+      "grad_norm": 0.25917235016822815,
+      "kl": 0.0196533203125,
+      "learning_rate": 1e-06,
+      "loss": 0.0129,
+      "num_tokens": 59741384.0,
+      "reward": 1.171875238418579,
+      "reward_std": 0.23336145281791687,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1763392835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3815346360206604,
+      "step": 139
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3192.0,
+      "completions/max_terminated_length": 3192.0,
+      "completions/mean_length": 350.24554443359375,
+      "completions/mean_terminated_length": 350.24554443359375,
+      "completions/min_length": 144.0,
+      "completions/min_terminated_length": 144.0,
+      "epoch": 0.5776173285198556,
+      "grad_norm": 0.2704315781593323,
+      "kl": 0.01824951171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0103,
+      "num_tokens": 60173646.0,
+      "reward": 1.165178656578064,
+      "reward_std": 0.20555807650089264,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457,
+      "step": 140
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 889.0,
+      "completions/max_terminated_length": 889.0,
+      "completions/mean_length": 348.03350830078125,
+      "completions/mean_terminated_length": 348.03350830078125,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 0.5817431665807117,
+      "grad_norm": 0.24649009108543396,
+      "kl": 0.01763916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0198,
+      "num_tokens": 60600283.0,
+      "reward": 1.1450893878936768,
+      "reward_std": 0.20083805918693542,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.352584570646286,
+      "step": 141
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1494.0,
+      "completions/max_terminated_length": 1494.0,
+      "completions/mean_length": 369.7388610839844,
+      "completions/mean_terminated_length": 369.7388610839844,
+      "completions/min_length": 119.0,
+      "completions/min_terminated_length": 119.0,
+      "epoch": 0.5858690046415678,
+      "grad_norm": 0.2548430562019348,
+      "kl": 0.01636505126953125,
+      "learning_rate": 1e-06,
+      "loss": -0.0051,
+      "num_tokens": 61055684.0,
+      "reward": 1.1696429252624512,
+      "reward_std": 0.2090558409690857,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.3816458284854889,
+      "step": 142
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 992.0,
+      "completions/max_terminated_length": 992.0,
+      "completions/mean_length": 353.56475830078125,
+      "completions/mean_terminated_length": 353.56475830078125,
+      "completions/min_length": 129.0,
+      "completions/min_terminated_length": 129.0,
+      "epoch": 0.589994842702424,
+      "grad_norm": 0.2363528460264206,
+      "kl": 0.01692962646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0015,
+      "num_tokens": 61495544.0,
+      "reward": 1.149553656578064,
+      "reward_std": 0.20312771201133728,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.35703200101852417,
+      "step": 143
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 924.0,
+      "completions/max_terminated_length": 924.0,
+      "completions/mean_length": 346.1004638671875,
+      "completions/mean_terminated_length": 346.1004638671875,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "epoch": 0.5941206807632801,
+      "grad_norm": 0.25129303336143494,
+      "kl": 0.01854705810546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0096,
+      "num_tokens": 61909660.0,
+      "reward": 1.1584821939468384,
+      "reward_std": 0.20729246735572815,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194,
+      "step": 144
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1139.0,
+      "completions/max_terminated_length": 1139.0,
+      "completions/mean_length": 369.67413330078125,
+      "completions/mean_terminated_length": 369.67413330078125,
+      "completions/min_length": 139.0,
+      "completions/min_terminated_length": 139.0,
+      "epoch": 0.5982465188241362,
+      "grad_norm": 0.2561042606830597,
+      "kl": 0.01654052734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0039,
+      "num_tokens": 62344613.0,
+      "reward": 1.140625,
+      "reward_std": 0.22326858341693878,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361,
+      "step": 145
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 2887.0,
+      "completions/mean_length": 360.9687805175781,
+      "completions/mean_terminated_length": 352.61297607421875,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 0.6023723568849922,
+      "grad_norm": 0.25807270407676697,
+      "kl": 0.0168609619140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0234,
+      "num_tokens": 62762009.0,
+      "reward": 1.2142857313156128,
+      "reward_std": 0.2551623582839966,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.4334910213947296,
+      "step": 146
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1202.0,
+      "completions/mean_length": 378.3750305175781,
+      "completions/mean_terminated_length": 370.05816650390625,
+      "completions/min_length": 127.0,
+      "completions/min_terminated_length": 127.0,
+      "epoch": 0.6064981949458483,
+      "grad_norm": 0.3243965208530426,
+      "kl": 0.0496826171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0411,
+      "num_tokens": 63197325.0,
+      "reward": 1.1540179252624512,
+      "reward_std": 0.21813544631004333,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194,
+      "step": 147
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1059.0,
+      "completions/max_terminated_length": 1059.0,
+      "completions/mean_length": 354.47991943359375,
+      "completions/mean_terminated_length": 354.47991943359375,
+      "completions/min_length": 90.0,
+      "completions/min_terminated_length": 90.0,
+      "epoch": 0.6106240330067045,
+      "grad_norm": 0.27332666516304016,
+      "kl": 0.0201263427734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0223,
+      "num_tokens": 63627732.0,
+      "reward": 1.140625,
+      "reward_std": 0.24266810715198517,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3525845408439636,
+      "step": 148
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1062.0,
+      "completions/max_terminated_length": 1062.0,
+      "completions/mean_length": 355.8035888671875,
+      "completions/mean_terminated_length": 355.8035888671875,
+      "completions/min_length": 97.0,
+      "completions/min_terminated_length": 97.0,
+      "epoch": 0.6147498710675606,
+      "grad_norm": 0.24161125719547272,
+      "kl": 0.016448974609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0049,
+      "num_tokens": 64075567.0,
+      "reward": 1.1473214626312256,
+      "reward_std": 0.21147961914539337,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437,
+      "step": 149
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 967.0,
+      "completions/max_terminated_length": 967.0,
+      "completions/mean_length": 390.1383972167969,
+      "completions/mean_terminated_length": 390.1383972167969,
+      "completions/min_length": 140.0,
+      "completions/min_terminated_length": 140.0,
+      "epoch": 0.6188757091284167,
+      "grad_norm": 0.19339683651924133,
+      "kl": 0.01575469970703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0076,
+      "num_tokens": 64536040.0,
+      "reward": 1.118303656578064,
+      "reward_std": 0.16575203835964203,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1205357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3259509205818176,
+      "step": 150
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1122.0,
+      "completions/max_terminated_length": 1122.0,
+      "completions/mean_length": 346.2812805175781,
+      "completions/mean_terminated_length": 346.2812805175781,
+      "completions/min_length": 102.0,
+      "completions/min_terminated_length": 102.0,
+      "epoch": 0.6230015471892728,
+      "grad_norm": 0.2566029727458954,
+      "kl": 0.0163116455078125,
+      "learning_rate": 1e-06,
+      "loss": 0.021,
+      "num_tokens": 64954262.0,
+      "reward": 1.1629464626312256,
+      "reward_std": 0.23038895428180695,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457,
+      "step": 151
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 813.0,
+      "completions/max_terminated_length": 813.0,
+      "completions/mean_length": 344.9687805175781,
+      "completions/mean_terminated_length": 344.9687805175781,
+      "completions/min_length": 83.0,
+      "completions/min_terminated_length": 83.0,
+      "epoch": 0.6271273852501289,
+      "grad_norm": 0.2446659654378891,
+      "kl": 0.01611328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0024,
+      "num_tokens": 65365750.0,
+      "reward": 1.1473214626312256,
+      "reward_std": 0.17018462717533112,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184,
+      "step": 152
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1171.0,
+      "completions/max_terminated_length": 1171.0,
+      "completions/mean_length": 348.5245666503906,
+      "completions/mean_terminated_length": 348.5245666503906,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.631253223310985,
+      "grad_norm": 0.28293153643608093,
+      "kl": 0.03037261962890625,
+      "learning_rate": 1e-06,
+      "loss": -0.0008,
+      "num_tokens": 65796226.0,
+      "reward": 1.1875,
+      "reward_std": 0.25763100385665894,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3942854404449463,
+      "step": 153
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 898.0,
+      "completions/max_terminated_length": 898.0,
+      "completions/mean_length": 370.1071472167969,
+      "completions/mean_terminated_length": 370.1071472167969,
+      "completions/min_length": 134.0,
+      "completions/min_terminated_length": 134.0,
+      "epoch": 0.6353790613718412,
+      "grad_norm": 0.24039463698863983,
+      "kl": 0.015869140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0092,
+      "num_tokens": 66231975.0,
+      "reward": 1.1517857313156128,
+      "reward_std": 0.21372443437576294,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.36136937141418457,
+      "step": 154
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1067.0,
+      "completions/max_terminated_length": 1067.0,
+      "completions/mean_length": 394.8258972167969,
+      "completions/mean_terminated_length": 394.8258972167969,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "epoch": 0.6395048994326973,
+      "grad_norm": 0.1961672157049179,
+      "kl": 0.01488494873046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0085,
+      "num_tokens": 66691440.0,
+      "reward": 1.140625,
+      "reward_std": 0.12813948094844818,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1428571492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.3503182828426361,
+      "step": 155
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1462.0,
+      "completions/max_terminated_length": 1462.0,
+      "completions/mean_length": 352.5848388671875,
+      "completions/mean_terminated_length": 352.5848388671875,
+      "completions/min_length": 125.0,
+      "completions/min_terminated_length": 125.0,
+      "epoch": 0.6436307374935534,
+      "grad_norm": 0.25957784056663513,
+      "kl": 0.0162506103515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0113,
+      "num_tokens": 67124834.0,
+      "reward": 1.203125,
+      "reward_std": 0.219389408826828,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2053571492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.40441396832466125,
+      "step": 156
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1036.0,
+      "completions/max_terminated_length": 1036.0,
+      "completions/mean_length": 358.00225830078125,
+      "completions/mean_terminated_length": 358.00225830078125,
+      "completions/min_length": 104.0,
+      "completions/min_terminated_length": 104.0,
+      "epoch": 0.6477565755544095,
+      "grad_norm": 0.24108652770519257,
+      "kl": 0.01555633544921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0103,
+      "num_tokens": 67562832.0,
+      "reward": 1.1584821939468384,
+      "reward_std": 0.2208447903394699,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.3716694116592407,
+      "step": 157
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1446.0,
+      "completions/max_terminated_length": 1446.0,
+      "completions/mean_length": 366.8883972167969,
+      "completions/mean_terminated_length": 366.8883972167969,
+      "completions/min_length": 98.0,
+      "completions/min_terminated_length": 98.0,
+      "epoch": 0.6518824136152656,
+      "grad_norm": 0.23085784912109375,
+      "kl": 0.01419830322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 67971421.0,
+      "reward": 1.1473214626312256,
+      "reward_std": 0.17780883610248566,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1473214328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3548222482204437,
+      "step": 158
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1530.0,
+      "completions/max_terminated_length": 1530.0,
+      "completions/mean_length": 374.72991943359375,
+      "completions/mean_terminated_length": 374.72991943359375,
+      "completions/min_length": 136.0,
+      "completions/min_terminated_length": 136.0,
+      "epoch": 0.6560082516761218,
+      "grad_norm": 0.2072339653968811,
+      "kl": 0.01485443115234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0055,
+      "num_tokens": 68419713.0,
+      "reward": 1.1607143878936768,
+      "reward_std": 0.156369149684906,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928,
+      "step": 159
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1529.0,
+      "completions/max_terminated_length": 1529.0,
+      "completions/mean_length": 391.3415222167969,
+      "completions/mean_terminated_length": 391.3415222167969,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "epoch": 0.6601340897369778,
+      "grad_norm": 0.22335219383239746,
+      "kl": 0.018585205078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0129,
+      "num_tokens": 68894900.0,
+      "reward": 1.1629464626312256,
+      "reward_std": 0.20225736498832703,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457,
+      "step": 160
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1236.0,
+      "completions/max_terminated_length": 1236.0,
+      "completions/mean_length": 385.0826110839844,
+      "completions/mean_terminated_length": 385.0826110839844,
+      "completions/min_length": 152.0,
+      "completions/min_terminated_length": 152.0,
+      "epoch": 0.6642599277978339,
+      "grad_norm": 0.22281871736049652,
+      "kl": 0.01507568359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0098,
+      "num_tokens": 69340053.0,
+      "reward": 1.1674107313156128,
+      "reward_std": 0.19550350308418274,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276,
+      "step": 161
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 944.0,
+      "completions/max_terminated_length": 944.0,
+      "completions/mean_length": 363.0000305175781,
+      "completions/mean_terminated_length": 363.0000305175781,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.66838576585869,
+      "grad_norm": 0.23224228620529175,
+      "kl": 0.0154876708984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0021,
+      "num_tokens": 69771568.0,
+      "reward": 1.1361607313156128,
+      "reward_std": 0.20033742487430573,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1383928507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.34569787979125977,
+      "step": 162
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1050.0,
+      "completions/max_terminated_length": 1050.0,
+      "completions/mean_length": 361.09600830078125,
+      "completions/mean_terminated_length": 361.09600830078125,
+      "completions/min_length": 136.0,
+      "completions/min_terminated_length": 136.0,
+      "epoch": 0.6725116039195461,
+      "grad_norm": 0.24737422168254852,
+      "kl": 0.01534271240234375,
+      "learning_rate": 1e-06,
+      "loss": -0.0084,
+      "num_tokens": 70200179.0,
+      "reward": 1.1629464626312256,
+      "reward_std": 0.22474639117717743,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1651785671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.37175676226615906,
+      "step": 163
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1104.0,
+      "completions/max_terminated_length": 1104.0,
+      "completions/mean_length": 351.03350830078125,
+      "completions/mean_terminated_length": 351.03350830078125,
+      "completions/min_length": 128.0,
+      "completions/min_terminated_length": 128.0,
+      "epoch": 0.6766374419804023,
+      "grad_norm": 0.2552102208137512,
+      "kl": 0.01573944091796875,
+      "learning_rate": 1e-06,
+      "loss": 0.021,
+      "num_tokens": 70628041.0,
+      "reward": 1.1450893878936768,
+      "reward_std": 0.1989629715681076,
+      "rewards/code_format_reward/mean": 0.9888392686843872,
+      "rewards/code_format_reward/std": 0.10517053306102753,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.36349809169769287,
+      "step": 164
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 981.0,
+      "completions/max_terminated_length": 981.0,
+      "completions/mean_length": 365.0535888671875,
+      "completions/mean_terminated_length": 365.0535888671875,
+      "completions/min_length": 123.0,
+      "completions/min_terminated_length": 123.0,
+      "epoch": 0.6807632800412584,
+      "grad_norm": 0.2395404428243637,
+      "kl": 0.0143280029296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 71071298.0,
+      "reward": 1.1875,
+      "reward_std": 0.22171072661876678,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161,
+      "step": 165
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 957.0,
+      "completions/max_terminated_length": 957.0,
+      "completions/mean_length": 362.4844055175781,
+      "completions/mean_terminated_length": 362.4844055175781,
+      "completions/min_length": 147.0,
+      "completions/min_terminated_length": 147.0,
+      "epoch": 0.6848891181021145,
+      "grad_norm": 0.21977753937244415,
+      "kl": 0.01514434814453125,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 71518906.0,
+      "reward": 1.149553656578064,
+      "reward_std": 0.1930348426103592,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1540178507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.36136940121650696,
+      "step": 166
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 971.0,
+      "completions/max_terminated_length": 971.0,
+      "completions/mean_length": 342.22991943359375,
+      "completions/mean_terminated_length": 342.22991943359375,
+      "completions/min_length": 121.0,
+      "completions/min_terminated_length": 121.0,
+      "epoch": 0.6890149561629706,
+      "grad_norm": 0.2827470004558563,
+      "kl": 0.015411376953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0304,
+      "num_tokens": 71938603.0,
+      "reward": 1.1941964626312256,
+      "reward_std": 0.26317548751831055,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.4088349938392639,
+      "step": 167
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1074.0,
+      "completions/max_terminated_length": 1074.0,
+      "completions/mean_length": 384.7901916503906,
+      "completions/mean_terminated_length": 384.7901916503906,
+      "completions/min_length": 98.0,
+      "completions/min_terminated_length": 98.0,
+      "epoch": 0.6931407942238267,
+      "grad_norm": 0.23986679315567017,
+      "kl": 0.015289306640625,
+      "learning_rate": 1e-06,
+      "loss": 0.014,
+      "num_tokens": 72401657.0,
+      "reward": 1.0982143878936768,
+      "reward_std": 0.18790170550346375,
+      "rewards/code_format_reward/mean": 0.9888392686843872,
+      "rewards/code_format_reward/std": 0.10517053306102753,
+      "rewards/curriculum_aware_reward_fn/mean": 0.109375,
+      "rewards/curriculum_aware_reward_fn/std": 0.3124580383300781,
+      "step": 168
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1182.0,
+      "completions/max_terminated_length": 1182.0,
+      "completions/mean_length": 366.13616943359375,
+      "completions/mean_terminated_length": 366.13616943359375,
+      "completions/min_length": 102.0,
+      "completions/min_terminated_length": 102.0,
+      "epoch": 0.6972666322846828,
+      "grad_norm": 0.2206612080335617,
+      "kl": 0.013885498046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0084,
+      "num_tokens": 72851139.0,
+      "reward": 1.171875,
+      "reward_std": 0.1650066077709198,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775,
+      "step": 169
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1107.0,
+      "completions/max_terminated_length": 1107.0,
+      "completions/mean_length": 361.0625305175781,
+      "completions/mean_terminated_length": 361.0625305175781,
+      "completions/min_length": 114.0,
+      "completions/min_terminated_length": 114.0,
+      "epoch": 0.701392470345539,
+      "grad_norm": 0.19477325677871704,
+      "kl": 0.01851654052734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0111,
+      "num_tokens": 73305089.0,
+      "reward": 1.15625,
+      "reward_std": 0.1615247130393982,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1584821492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.36560073494911194,
+      "step": 170
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1212.0,
+      "completions/max_terminated_length": 1212.0,
+      "completions/mean_length": 367.2544860839844,
+      "completions/mean_terminated_length": 367.2544860839844,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "epoch": 0.7055183084063951,
+      "grad_norm": 0.25459757447242737,
+      "kl": 0.0151519775390625,
+      "learning_rate": 1e-06,
+      "loss": -0.0083,
+      "num_tokens": 73743438.0,
+      "reward": 1.149553656578064,
+      "reward_std": 0.19232958555221558,
+      "rewards/code_format_reward/mean": 0.9888392686843872,
+      "rewards/code_format_reward/std": 0.10517053306102753,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3676777780056,
+      "step": 171
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 943.0,
+      "completions/max_terminated_length": 943.0,
+      "completions/mean_length": 346.82366943359375,
+      "completions/mean_terminated_length": 346.82366943359375,
+      "completions/min_length": 132.0,
+      "completions/min_terminated_length": 132.0,
+      "epoch": 0.7096441464672512,
+      "grad_norm": 0.26766178011894226,
+      "kl": 0.0182037353515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0094,
+      "num_tokens": 74159257.0,
+      "reward": 1.1629464626312256,
+      "reward_std": 0.24092715978622437,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.38935965299606323,
+      "step": 172
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 843.0,
+      "completions/max_terminated_length": 843.0,
+      "completions/mean_length": 340.3258972167969,
+      "completions/mean_terminated_length": 340.3258972167969,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "epoch": 0.7137699845281072,
+      "grad_norm": 0.25564658641815186,
+      "kl": 0.0155181884765625,
+      "learning_rate": 1e-06,
+      "loss": 0.014,
+      "num_tokens": 74568644.0,
+      "reward": 1.1741071939468384,
+      "reward_std": 0.2240411341190338,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3834212124347687,
+      "step": 173
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1173.0,
+      "completions/max_terminated_length": 1173.0,
+      "completions/mean_length": 322.3035888671875,
+      "completions/mean_terminated_length": 322.3035888671875,
+      "completions/min_length": 108.0,
+      "completions/min_terminated_length": 108.0,
+      "epoch": 0.7178958225889633,
+      "grad_norm": 0.2987499535083771,
+      "kl": 0.02458953857421875,
+      "learning_rate": 1e-06,
+      "loss": 0.02,
+      "num_tokens": 74979603.0,
+      "reward": 1.2053571939468384,
+      "reward_std": 0.27893462777137756,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2120535671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.4146503508090973,
+      "step": 174
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1080.0,
+      "completions/mean_length": 333.75,
+      "completions/mean_terminated_length": 325.3333435058594,
+      "completions/min_length": 113.0,
+      "completions/min_terminated_length": 113.0,
+      "epoch": 0.7220216606498195,
+      "grad_norm": 0.2896713316440582,
+      "kl": 0.0155487060546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0192,
+      "num_tokens": 75427687.0,
+      "reward": 1.1741071939468384,
+      "reward_std": 0.23012901842594147,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1808035671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.3852855861186981,
+      "step": 175
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 827.0,
+      "completions/max_terminated_length": 827.0,
+      "completions/mean_length": 333.93304443359375,
+      "completions/mean_terminated_length": 333.93304443359375,
+      "completions/min_length": 130.0,
+      "completions/min_terminated_length": 130.0,
+      "epoch": 0.7261474987106756,
+      "grad_norm": 0.26385900378227234,
+      "kl": 0.01572418212890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0158,
+      "num_tokens": 75846166.0,
+      "reward": 1.180803656578064,
+      "reward_std": 0.24136929214000702,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.387128084897995,
+      "step": 176
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1144.0,
+      "completions/max_terminated_length": 1144.0,
+      "completions/mean_length": 338.9598388671875,
+      "completions/mean_terminated_length": 338.9598388671875,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 0.7302733367715317,
+      "grad_norm": 0.2855651080608368,
+      "kl": 0.01647186279296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0103,
+      "num_tokens": 76266400.0,
+      "reward": 1.1629464626312256,
+      "reward_std": 0.24424399435520172,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349845170975,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.40068626403808594,
+      "step": 177
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1064.0,
+      "completions/max_terminated_length": 1064.0,
+      "completions/mean_length": 311.58929443359375,
+      "completions/mean_terminated_length": 311.58929443359375,
+      "completions/min_length": 101.0,
+      "completions/min_terminated_length": 101.0,
+      "epoch": 0.7343991748323878,
+      "grad_norm": 0.26801854372024536,
+      "kl": 0.02155303955078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 76686384.0,
+      "reward": 1.1852679252624512,
+      "reward_std": 0.2134612649679184,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.39252743124961853,
+      "step": 178
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 813.0,
+      "completions/max_terminated_length": 813.0,
+      "completions/mean_length": 304.47991943359375,
+      "completions/mean_terminated_length": 304.47991943359375,
+      "completions/min_length": 118.0,
+      "completions/min_terminated_length": 118.0,
+      "epoch": 0.7385250128932439,
+      "grad_norm": 0.35143211483955383,
+      "kl": 0.03144073486328125,
+      "learning_rate": 1e-06,
+      "loss": 0.0237,
+      "num_tokens": 77071872.0,
+      "reward": 1.2566964626312256,
+      "reward_std": 0.30517578125,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.44096609950065613,
+      "step": 179
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1169.0,
+      "completions/max_terminated_length": 1169.0,
+      "completions/mean_length": 341.2544860839844,
+      "completions/mean_terminated_length": 341.2544860839844,
+      "completions/min_length": 81.0,
+      "completions/min_terminated_length": 81.0,
+      "epoch": 0.7426508509541001,
+      "grad_norm": 0.24900677800178528,
+      "kl": 0.01654052734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0226,
+      "num_tokens": 77484440.0,
+      "reward": 1.1473214626312256,
+      "reward_std": 0.21222974359989166,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184,
+      "step": 180
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1099.0,
+      "completions/max_terminated_length": 1099.0,
+      "completions/mean_length": 332.39288330078125,
+      "completions/mean_terminated_length": 332.39288330078125,
+      "completions/min_length": 102.0,
+      "completions/min_terminated_length": 102.0,
+      "epoch": 0.7467766890149562,
+      "grad_norm": 0.2445136457681656,
+      "kl": 0.01773834228515625,
+      "learning_rate": 1e-06,
+      "loss": 0.0204,
+      "num_tokens": 77914906.0,
+      "reward": 1.165178656578064,
+      "reward_std": 0.2167556881904602,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775,
+      "step": 181
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1067.0,
+      "completions/max_terminated_length": 1067.0,
+      "completions/mean_length": 318.69866943359375,
+      "completions/mean_terminated_length": 318.69866943359375,
+      "completions/min_length": 109.0,
+      "completions/min_terminated_length": 109.0,
+      "epoch": 0.7509025270758123,
+      "grad_norm": 0.23836004734039307,
+      "kl": 0.023773193359375,
+      "learning_rate": 1e-06,
+      "loss": 0.01,
+      "num_tokens": 78312847.0,
+      "reward": 1.1473214626312256,
+      "reward_std": 0.18241570889949799,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3592142164707184,
+      "step": 182
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 767.0,
+      "completions/max_terminated_length": 767.0,
+      "completions/mean_length": 301.0245666503906,
+      "completions/mean_terminated_length": 301.0245666503906,
+      "completions/min_length": 98.0,
+      "completions/min_terminated_length": 98.0,
+      "epoch": 0.7550283651366684,
+      "grad_norm": 0.2632372975349426,
+      "kl": 0.0267791748046875,
+      "learning_rate": 1e-06,
+      "loss": 0.013,
+      "num_tokens": 78707133.0,
+      "reward": 1.1584821939468384,
+      "reward_std": 0.18409815430641174,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3697296679019928,
+      "step": 183
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 990.0,
+      "completions/max_terminated_length": 990.0,
+      "completions/mean_length": 315.3102722167969,
+      "completions/mean_terminated_length": 315.3102722167969,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "epoch": 0.7591542031975245,
+      "grad_norm": 0.25678277015686035,
+      "kl": 0.0182647705078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0068,
+      "num_tokens": 79133075.0,
+      "reward": 1.1741071939468384,
+      "reward_std": 0.1857805848121643,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3834211826324463,
+      "step": 184
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1141.0,
+      "completions/max_terminated_length": 1141.0,
+      "completions/mean_length": 314.8348388671875,
+      "completions/mean_terminated_length": 314.8348388671875,
+      "completions/min_length": 107.0,
+      "completions/min_terminated_length": 107.0,
+      "epoch": 0.7632800412583806,
+      "grad_norm": 0.26571905612945557,
+      "kl": 0.018341064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0054,
+      "num_tokens": 79543950.0,
+      "reward": 1.1383929252624512,
+      "reward_std": 0.21120063960552216,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1450892835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.352584570646286,
+      "step": 185
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1608.0,
+      "completions/max_terminated_length": 1608.0,
+      "completions/mean_length": 296.62725830078125,
+      "completions/mean_terminated_length": 296.62725830078125,
+      "completions/min_length": 106.0,
+      "completions/min_terminated_length": 106.0,
+      "epoch": 0.7674058793192368,
+      "grad_norm": 0.3173041343688965,
+      "kl": 0.0209197998046875,
+      "learning_rate": 1e-06,
+      "loss": 0.03,
+      "num_tokens": 79936861.0,
+      "reward": 1.2388393878936768,
+      "reward_std": 0.26434528827667236,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2410714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.43853598833084106,
+      "step": 186
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 782.0,
+      "completions/max_terminated_length": 782.0,
+      "completions/mean_length": 306.97100830078125,
+      "completions/mean_terminated_length": 306.97100830078125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 0.7715317173800929,
+      "grad_norm": 0.2590916156768799,
+      "kl": 0.0192108154296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0001,
+      "num_tokens": 80341945.0,
+      "reward": 1.1227679252624512,
+      "reward_std": 0.2021593153476715,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996,
+      "step": 187
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 697.0,
+      "completions/max_terminated_length": 697.0,
+      "completions/mean_length": 298.37054443359375,
+      "completions/mean_terminated_length": 298.37054443359375,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.7756575554409489,
+      "grad_norm": 0.26516759395599365,
+      "kl": 0.021209716796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0153,
+      "num_tokens": 80738444.0,
+      "reward": 1.1227679252624512,
+      "reward_std": 0.20725637674331665,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1272321492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.3336053788661957,
+      "step": 188
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 807.0,
+      "completions/max_terminated_length": 807.0,
+      "completions/mean_length": 292.6004638671875,
+      "completions/mean_terminated_length": 292.6004638671875,
+      "completions/min_length": 97.0,
+      "completions/min_terminated_length": 97.0,
+      "epoch": 0.779783393501805,
+      "grad_norm": 0.30099207162857056,
+      "kl": 0.019683837890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0113,
+      "num_tokens": 81127619.0,
+      "reward": 1.234375,
+      "reward_std": 0.2714380919933319,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2410714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.42821168899536133,
+      "step": 189
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 819.0,
+      "completions/max_terminated_length": 819.0,
+      "completions/mean_length": 284.65850830078125,
+      "completions/mean_terminated_length": 284.65850830078125,
+      "completions/min_length": 73.0,
+      "completions/min_terminated_length": 73.0,
+      "epoch": 0.7839092315626611,
+      "grad_norm": 0.2730914354324341,
+      "kl": 0.0259552001953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0002,
+      "num_tokens": 81507881.0,
+      "reward": 1.1897321939468384,
+      "reward_std": 0.23237062990665436,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1941964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3960230052471161,
+      "step": 190
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 822.0,
+      "completions/max_terminated_length": 822.0,
+      "completions/mean_length": 295.046875,
+      "completions/mean_terminated_length": 295.046875,
+      "completions/min_length": 94.0,
+      "completions/min_terminated_length": 94.0,
+      "epoch": 0.7880350696235173,
+      "grad_norm": 0.2663305997848511,
+      "kl": 0.021148681640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 81889192.0,
+      "reward": 1.2075893878936768,
+      "reward_std": 0.1989629715681076,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2075892835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.4060344398021698,
+      "step": 191
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1043.0,
+      "completions/max_terminated_length": 1043.0,
+      "completions/mean_length": 313.72100830078125,
+      "completions/mean_terminated_length": 313.72100830078125,
+      "completions/min_length": 112.0,
+      "completions/min_terminated_length": 112.0,
+      "epoch": 0.7921609076843734,
+      "grad_norm": 0.2648645341396332,
+      "kl": 0.0235595703125,
+      "learning_rate": 1e-06,
+      "loss": 0.0069,
+      "num_tokens": 82322752.0,
+      "reward": 1.1540179252624512,
+      "reward_std": 0.20486865937709808,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.36349809169769287,
+      "step": 192
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 853.0,
+      "completions/max_terminated_length": 853.0,
+      "completions/mean_length": 295.3504638671875,
+      "completions/mean_terminated_length": 295.3504638671875,
+      "completions/min_length": 117.0,
+      "completions/min_terminated_length": 117.0,
+      "epoch": 0.7962867457452295,
+      "grad_norm": 0.2970978021621704,
+      "kl": 0.0200653076171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0043,
+      "num_tokens": 82728454.0,
+      "reward": 1.196428656578064,
+      "reward_std": 0.23176342248916626,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.203125,
+      "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365,
+      "step": 193
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 737.0,
+      "completions/max_terminated_length": 737.0,
+      "completions/mean_length": 310.0401916503906,
+      "completions/mean_terminated_length": 310.0401916503906,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.8004125838060856,
+      "grad_norm": 0.2468232959508896,
+      "kl": 0.0216064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0164,
+      "num_tokens": 83131742.0,
+      "reward": 1.1875,
+      "reward_std": 0.20672789216041565,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3942854404449463,
+      "step": 194
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 830.0,
+      "completions/max_terminated_length": 830.0,
+      "completions/mean_length": 301.2633972167969,
+      "completions/mean_terminated_length": 301.2633972167969,
+      "completions/min_length": 116.0,
+      "completions/min_terminated_length": 116.0,
+      "epoch": 0.8045384218669417,
+      "grad_norm": 0.28781455755233765,
+      "kl": 0.02532958984375,
+      "learning_rate": 1e-06,
+      "loss": -0.0154,
+      "num_tokens": 83525491.0,
+      "reward": 1.1830357313156128,
+      "reward_std": 0.21340614557266235,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313,
+      "step": 195
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1114.0,
+      "completions/max_terminated_length": 1114.0,
+      "completions/mean_length": 298.15179443359375,
+      "completions/mean_terminated_length": 298.15179443359375,
+      "completions/min_length": 81.0,
+      "completions/min_terminated_length": 81.0,
+      "epoch": 0.8086642599277978,
+      "grad_norm": 0.25646495819091797,
+      "kl": 0.0185699462890625,
+      "learning_rate": 1e-06,
+      "loss": 0.0112,
+      "num_tokens": 83930551.0,
+      "reward": 1.2008929252624512,
+      "reward_std": 0.21293501555919647,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.203125,
+      "rewards/curriculum_aware_reward_fn/std": 0.4027745723724365,
+      "step": 196
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1275.0,
+      "completions/max_terminated_length": 1275.0,
+      "completions/mean_length": 308.0401916503906,
+      "completions/mean_terminated_length": 308.0401916503906,
+      "completions/min_length": 110.0,
+      "completions/min_terminated_length": 110.0,
+      "epoch": 0.812790097988654,
+      "grad_norm": 0.2715901732444763,
+      "kl": 0.0261383056640625,
+      "learning_rate": 1e-06,
+      "loss": 0.0105,
+      "num_tokens": 84335582.0,
+      "reward": 1.1852679252624512,
+      "reward_std": 0.23676243424415588,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1875,
+      "rewards/curriculum_aware_reward_fn/std": 0.41839686036109924,
+      "step": 197
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 898.0,
+      "completions/max_terminated_length": 898.0,
+      "completions/mean_length": 288.3482360839844,
+      "completions/mean_terminated_length": 288.3482360839844,
+      "completions/min_length": 94.0,
+      "completions/min_terminated_length": 94.0,
+      "epoch": 0.8169159360495101,
+      "grad_norm": 0.2405804991722107,
+      "kl": 0.0204620361328125,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 84718631.0,
+      "reward": 1.1986607313156128,
+      "reward_std": 0.1866028755903244,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1986607164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3994380533695221,
+      "step": 198
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 838.0,
+      "completions/max_terminated_length": 838.0,
+      "completions/mean_length": 278.7901916503906,
+      "completions/mean_terminated_length": 278.7901916503906,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "epoch": 0.8210417741103662,
+      "grad_norm": 0.30192846059799194,
+      "kl": 0.024627685546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0157,
+      "num_tokens": 85102669.0,
+      "reward": 1.1897321939468384,
+      "reward_std": 0.2407066822052002,
+      "rewards/code_format_reward/mean": 0.9888392686843872,
+      "rewards/code_format_reward/std": 0.10517053306102753,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2008928507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.4011160135269165,
+      "step": 199
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 770.0,
+      "completions/max_terminated_length": 770.0,
+      "completions/mean_length": 292.1763610839844,
+      "completions/mean_terminated_length": 292.1763610839844,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "epoch": 0.8251676121712223,
+      "grad_norm": 0.284535676240921,
+      "kl": 0.0212249755859375,
+      "learning_rate": 1e-06,
+      "loss": 0.0136,
+      "num_tokens": 85515524.0,
+      "reward": 1.1852679252624512,
+      "reward_std": 0.2346602827310562,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1897321492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.39252743124961853,
+      "step": 200
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1026.0,
+      "completions/max_terminated_length": 1026.0,
+      "completions/mean_length": 276.66741943359375,
+      "completions/mean_terminated_length": 276.66741943359375,
+      "completions/min_length": 122.0,
+      "completions/min_terminated_length": 122.0,
+      "epoch": 0.8292934502320783,
+      "grad_norm": 0.2965720593929291,
+      "kl": 0.0219879150390625,
+      "learning_rate": 1e-06,
+      "loss": 0.0276,
+      "num_tokens": 85908173.0,
+      "reward": 1.2165179252624512,
+      "reward_std": 0.2559392750263214,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2209821492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.4153723120689392,
+      "step": 201
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 834.0,
+      "completions/max_terminated_length": 834.0,
+      "completions/mean_length": 283.140625,
+      "completions/mean_terminated_length": 283.140625,
+      "completions/min_length": 107.0,
+      "completions/min_terminated_length": 107.0,
+      "epoch": 0.8334192882929345,
+      "grad_norm": 0.2588541507720947,
+      "kl": 0.0260009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0014,
+      "num_tokens": 86294237.0,
+      "reward": 1.15625,
+      "reward_std": 0.2073148787021637,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.15625,
+      "rewards/curriculum_aware_reward_fn/std": 0.36349809169769287,
+      "step": 202
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 929.0,
+      "completions/max_terminated_length": 929.0,
+      "completions/mean_length": 295.6540222167969,
+      "completions/mean_terminated_length": 295.6540222167969,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "epoch": 0.8375451263537906,
+      "grad_norm": 0.2303047776222229,
+      "kl": 0.0272064208984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0114,
+      "num_tokens": 86708692.0,
+      "reward": 1.1227679252624512,
+      "reward_std": 0.17258599400520325,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.125,
+      "rewards/curriculum_aware_reward_fn/std": 0.3310886323451996,
+      "step": 203
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 736.0,
+      "completions/max_terminated_length": 736.0,
+      "completions/mean_length": 268.2276916503906,
+      "completions/mean_terminated_length": 268.2276916503906,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "epoch": 0.8416709644146467,
+      "grad_norm": 0.28269636631011963,
+      "kl": 0.0269012451171875,
+      "learning_rate": 1e-06,
+      "loss": 0.0143,
+      "num_tokens": 87093689.0,
+      "reward": 1.1674107313156128,
+      "reward_std": 0.18484607338905334,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.171875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3776935040950775,
+      "step": 204
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1485.0,
+      "completions/max_terminated_length": 1485.0,
+      "completions/mean_length": 270.5669860839844,
+      "completions/mean_terminated_length": 270.5669860839844,
+      "completions/min_length": 91.0,
+      "completions/min_terminated_length": 91.0,
+      "epoch": 0.8457968024755028,
+      "grad_norm": 0.3281504511833191,
+      "kl": 0.0283966064453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0012,
+      "num_tokens": 87479685.0,
+      "reward": 1.2209821939468384,
+      "reward_std": 0.28913792967796326,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.4221988022327423,
+      "step": 205
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 878.0,
+      "completions/max_terminated_length": 878.0,
+      "completions/mean_length": 264.1942138671875,
+      "completions/mean_terminated_length": 264.1942138671875,
+      "completions/min_length": 95.0,
+      "completions/min_terminated_length": 95.0,
+      "epoch": 0.8499226405363589,
+      "grad_norm": 0.28296637535095215,
+      "kl": 0.03045654296875,
+      "learning_rate": 1e-06,
+      "loss": 0.0128,
+      "num_tokens": 87859158.0,
+      "reward": 1.1629464626312256,
+      "reward_std": 0.2186012715101242,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1696428507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.37573832273483276,
+      "step": 206
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1376.0,
+      "completions/max_terminated_length": 1376.0,
+      "completions/mean_length": 257.7857360839844,
+      "completions/mean_terminated_length": 257.7857360839844,
+      "completions/min_length": 83.0,
+      "completions/min_terminated_length": 83.0,
+      "epoch": 0.8540484785972151,
+      "grad_norm": 0.325296550989151,
+      "kl": 0.0262603759765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0146,
+      "num_tokens": 88231364.0,
+      "reward": 1.2254464626312256,
+      "reward_std": 0.2650907635688782,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2299107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.42124560475349426,
+      "step": 207
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 753.0,
+      "completions/max_terminated_length": 753.0,
+      "completions/mean_length": 266.5379638671875,
+      "completions/mean_terminated_length": 266.5379638671875,
+      "completions/min_length": 100.0,
+      "completions/min_terminated_length": 100.0,
+      "epoch": 0.8581743166580712,
+      "grad_norm": 0.2887232303619385,
+      "kl": 0.0287933349609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0168,
+      "num_tokens": 88611985.0,
+      "reward": 1.2477679252624512,
+      "reward_std": 0.24733349680900574,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.25,
+      "rewards/curriculum_aware_reward_fn/std": 0.43349677324295044,
+      "step": 208
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 787.0,
+      "completions/mean_length": 274.9933166503906,
+      "completions/mean_terminated_length": 266.4451904296875,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "epoch": 0.8623001547189273,
+      "grad_norm": 0.2456807792186737,
+      "kl": 0.0304107666015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0256,
+      "num_tokens": 89014062.0,
+      "reward": 1.15625,
+      "reward_std": 0.20821687579154968,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843171834946,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1629464328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3697296380996704,
+      "step": 209
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 780.0,
+      "completions/mean_length": 269.3727722167969,
+      "completions/mean_terminated_length": 260.81207275390625,
+      "completions/min_length": 80.0,
+      "completions/min_terminated_length": 80.0,
+      "epoch": 0.8664259927797834,
+      "grad_norm": 0.311091810464859,
+      "kl": 0.0285797119140625,
+      "learning_rate": 1e-06,
+      "loss": 0.034,
+      "num_tokens": 89401434.0,
+      "reward": 1.2299107313156128,
+      "reward_std": 0.251747727394104,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2366071492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.4254741966724396,
+      "step": 210
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 538.0,
+      "completions/max_terminated_length": 538.0,
+      "completions/mean_length": 240.2366180419922,
+      "completions/mean_terminated_length": 240.2366180419922,
+      "completions/min_length": 91.0,
+      "completions/min_terminated_length": 91.0,
+      "epoch": 0.8705518308406395,
+      "grad_norm": 0.29727450013160706,
+      "kl": 0.0320892333984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0009,
+      "num_tokens": 89763790.0,
+      "reward": 1.1473214626312256,
+      "reward_std": 0.20072634518146515,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1495535671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.36324387788772583,
+      "step": 211
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 801.0,
+      "completions/max_terminated_length": 801.0,
+      "completions/mean_length": 252.15179443359375,
+      "completions/mean_terminated_length": 252.15179443359375,
+      "completions/min_length": 106.0,
+      "completions/min_terminated_length": 106.0,
+      "epoch": 0.8746776689014956,
+      "grad_norm": 0.30429133772850037,
+      "kl": 0.0283050537109375,
+      "learning_rate": 1e-06,
+      "loss": 0.0084,
+      "num_tokens": 90140269.0,
+      "reward": 1.1941964626312256,
+      "reward_std": 0.2299019694328308,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1964285671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.39774051308631897,
+      "step": 212
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 953.0,
+      "completions/max_terminated_length": 953.0,
+      "completions/mean_length": 262.7433166503906,
+      "completions/mean_terminated_length": 262.7433166503906,
+      "completions/min_length": 97.0,
+      "completions/min_terminated_length": 97.0,
+      "epoch": 0.8788035069623518,
+      "grad_norm": 0.33080312609672546,
+      "kl": 0.0282135009765625,
+      "learning_rate": 1e-06,
+      "loss": 0.0027,
+      "num_tokens": 90529879.0,
+      "reward": 1.2388393878936768,
+      "reward_std": 0.2763974070549011,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2410714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.42821168899536133,
+      "step": 213
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 835.0,
+      "completions/max_terminated_length": 835.0,
+      "completions/mean_length": 266.51788330078125,
+      "completions/mean_terminated_length": 266.51788330078125,
+      "completions/min_length": 115.0,
+      "completions/min_terminated_length": 115.0,
+      "epoch": 0.8829293450232079,
+      "grad_norm": 0.2860460877418518,
+      "kl": 0.026336669921875,
+      "learning_rate": 1e-06,
+      "loss": -0.0056,
+      "num_tokens": 90929384.0,
+      "reward": 1.2142857313156128,
+      "reward_std": 0.24735590815544128,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.21875,
+      "rewards/curriculum_aware_reward_fn/std": 0.4138607978820801,
+      "step": 214
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 890.0,
+      "completions/max_terminated_length": 890.0,
+      "completions/mean_length": 262.37725830078125,
+      "completions/mean_terminated_length": 262.37725830078125,
+      "completions/min_length": 88.0,
+      "completions/min_terminated_length": 88.0,
+      "epoch": 0.887055183084064,
+      "grad_norm": 0.30841729044914246,
+      "kl": 0.027740478515625,
+      "learning_rate": 1e-06,
+      "loss": -0.0033,
+      "num_tokens": 91306111.0,
+      "reward": 1.1897321939468384,
+      "reward_std": 0.23583011329174042,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1919642835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3942854106426239,
+      "step": 215
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 709.0,
+      "completions/max_terminated_length": 709.0,
+      "completions/mean_length": 256.8482360839844,
+      "completions/mean_terminated_length": 256.8482360839844,
+      "completions/min_length": 86.0,
+      "completions/min_terminated_length": 86.0,
+      "epoch": 0.89118102114492,
+      "grad_norm": 11.453292846679688,
+      "kl": 0.8353729248046875,
+      "learning_rate": 1e-06,
+      "loss": 0.0193,
+      "num_tokens": 91690343.0,
+      "reward": 1.196428656578064,
+      "reward_std": 0.23445691168308258,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2053571492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.40441396832466125,
+      "step": 216
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 666.0,
+      "completions/max_terminated_length": 666.0,
+      "completions/mean_length": 260.1651916503906,
+      "completions/mean_terminated_length": 260.1651916503906,
+      "completions/min_length": 101.0,
+      "completions/min_terminated_length": 101.0,
+      "epoch": 0.8953068592057761,
+      "grad_norm": 0.2754879891872406,
+      "kl": 0.0255889892578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0106,
+      "num_tokens": 92069019.0,
+      "reward": 1.1629464626312256,
+      "reward_std": 0.20804257690906525,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457,
+      "step": 217
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 688.0,
+      "completions/max_terminated_length": 688.0,
+      "completions/mean_length": 244.32366943359375,
+      "completions/mean_terminated_length": 244.32366943359375,
+      "completions/min_length": 78.0,
+      "completions/min_terminated_length": 78.0,
+      "epoch": 0.8994326972666323,
+      "grad_norm": 0.31076574325561523,
+      "kl": 0.0318450927734375,
+      "learning_rate": 1e-06,
+      "loss": 0.0066,
+      "num_tokens": 92435873.0,
+      "reward": 1.2254464626312256,
+      "reward_std": 0.2242594063282013,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2276785671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.41980281472206116,
+      "step": 218
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1105.0,
+      "completions/max_terminated_length": 1105.0,
+      "completions/mean_length": 259.37054443359375,
+      "completions/mean_terminated_length": 259.37054443359375,
+      "completions/min_length": 92.0,
+      "completions/min_terminated_length": 92.0,
+      "epoch": 0.9035585353274884,
+      "grad_norm": 0.3404967188835144,
+      "kl": 0.029510498046875,
+      "learning_rate": 1e-06,
+      "loss": -0.0106,
+      "num_tokens": 92822418.0,
+      "reward": 1.2098214626312256,
+      "reward_std": 0.26485005021095276,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.4177219867706299,
+      "step": 219
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1197.0,
+      "completions/mean_length": 266.6473388671875,
+      "completions/mean_terminated_length": 258.0805358886719,
+      "completions/min_length": 86.0,
+      "completions/min_terminated_length": 86.0,
+      "epoch": 0.9076843733883445,
+      "grad_norm": 0.32294103503227234,
+      "kl": 0.0294342041015625,
+      "learning_rate": 1e-06,
+      "loss": 0.0281,
+      "num_tokens": 93196529.0,
+      "reward": 1.2544643878936768,
+      "reward_std": 0.270959734916687,
+      "rewards/code_format_reward/mean": 0.9799107313156128,
+      "rewards/code_format_reward/std": 0.14046260714530945,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2745535671710968,
+      "rewards/curriculum_aware_reward_fn/std": 0.4467879831790924,
+      "step": 220
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1081.0,
+      "completions/max_terminated_length": 1081.0,
+      "completions/mean_length": 262.7098388671875,
+      "completions/mean_terminated_length": 262.7098388671875,
+      "completions/min_length": 102.0,
+      "completions/min_terminated_length": 102.0,
+      "epoch": 0.9118102114492006,
+      "grad_norm": 0.25437629222869873,
+      "kl": 0.0256500244140625,
+      "learning_rate": 1e-06,
+      "loss": 0.0015,
+      "num_tokens": 93565943.0,
+      "reward": 1.21875,
+      "reward_std": 0.1944902390241623,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2209821492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.4153723120689392,
+      "step": 221
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 724.0,
+      "completions/max_terminated_length": 724.0,
+      "completions/mean_length": 261.0044860839844,
+      "completions/mean_terminated_length": 261.0044860839844,
+      "completions/min_length": 61.0,
+      "completions/min_terminated_length": 61.0,
+      "epoch": 0.9159360495100567,
+      "grad_norm": 0.2887217104434967,
+      "kl": 0.032867431640625,
+      "learning_rate": 1e-06,
+      "loss": -0.0008,
+      "num_tokens": 93946353.0,
+      "reward": 1.1629464626312256,
+      "reward_std": 0.2028724104166031,
+      "rewards/code_format_reward/mean": 0.984375,
+      "rewards/code_format_reward/std": 0.12415824085474014,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3834212124347687,
+      "step": 222
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1376.0,
+      "completions/max_terminated_length": 1376.0,
+      "completions/mean_length": 297.7790222167969,
+      "completions/mean_terminated_length": 297.7790222167969,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "epoch": 0.9200618875709129,
+      "grad_norm": 0.31769412755966187,
+      "kl": 0.024017333984375,
+      "learning_rate": 1e-06,
+      "loss": 0.0136,
+      "num_tokens": 94344953.0,
+      "reward": 1.1473214626312256,
+      "reward_std": 0.2656678259372711,
+      "rewards/code_format_reward/mean": 0.9732142686843872,
+      "rewards/code_format_reward/std": 0.1616371124982834,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1741071492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.39691102504730225,
+      "step": 223
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 873.0,
+      "completions/max_terminated_length": 873.0,
+      "completions/mean_length": 299.4977722167969,
+      "completions/mean_terminated_length": 299.4977722167969,
+      "completions/min_length": 64.0,
+      "completions/min_terminated_length": 64.0,
+      "epoch": 0.924187725631769,
+      "grad_norm": 0.30828657746315,
+      "kl": 0.0286712646484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0251,
+      "num_tokens": 94757014.0,
+      "reward": 1.1473214626312256,
+      "reward_std": 0.26786863803863525,
+      "rewards/code_format_reward/mean": 0.9598214030265808,
+      "rewards/code_format_reward/std": 0.1965973675251007,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313,
+      "step": 224
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 2189.0,
+      "completions/max_terminated_length": 2189.0,
+      "completions/mean_length": 278.9576110839844,
+      "completions/mean_terminated_length": 278.9576110839844,
+      "completions/min_length": 118.0,
+      "completions/min_terminated_length": 118.0,
+      "epoch": 0.9283135636926251,
+      "grad_norm": 0.3763558864593506,
+      "kl": 0.026763916015625,
+      "learning_rate": 1e-06,
+      "loss": 0.016,
+      "num_tokens": 95150605.0,
+      "reward": 1.1763393878936768,
+      "reward_std": 0.3294941782951355,
+      "rewards/code_format_reward/mean": 0.9464285969734192,
+      "rewards/code_format_reward/std": 0.2254217267036438,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2299107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.42124560475349426,
+      "step": 225
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 905.0,
+      "completions/max_terminated_length": 905.0,
+      "completions/mean_length": 283.1785888671875,
+      "completions/mean_terminated_length": 283.1785888671875,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "epoch": 0.9324394017534812,
+      "grad_norm": 0.37017151713371277,
+      "kl": 0.0257415771484375,
+      "learning_rate": 1e-06,
+      "loss": 0.0236,
+      "num_tokens": 95541054.0,
+      "reward": 1.1696429252624512,
+      "reward_std": 0.2892906069755554,
+      "rewards/code_format_reward/mean": 0.953125,
+      "rewards/code_format_reward/std": 0.21160738170146942,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2165178507566452,
+      "rewards/curriculum_aware_reward_fn/std": 0.41233164072036743,
+      "step": 226
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0022321428571429047,
+      "completions/max_length": 4096.0,
+      "completions/max_terminated_length": 1383.0,
+      "completions/mean_length": 281.75225830078125,
+      "completions/mean_terminated_length": 273.21923828125,
+      "completions/min_length": 98.0,
+      "completions/min_terminated_length": 98.0,
+      "epoch": 0.9365652398143373,
+      "grad_norm": 0.3290660083293915,
+      "kl": 0.0265350341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.0388,
+      "num_tokens": 95929912.0,
+      "reward": 1.2388393878936768,
+      "reward_std": 0.27593371272087097,
+      "rewards/code_format_reward/mean": 0.9754464030265808,
+      "rewards/code_format_reward/std": 0.1549331247806549,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2633928656578064,
+      "rewards/curriculum_aware_reward_fn/std": 0.44096609950065613,
+      "step": 227
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1050.0,
+      "completions/max_terminated_length": 1050.0,
+      "completions/mean_length": 293.28350830078125,
+      "completions/mean_terminated_length": 293.28350830078125,
+      "completions/min_length": 44.0,
+      "completions/min_terminated_length": 44.0,
+      "epoch": 0.9406910778751933,
+      "grad_norm": 0.300258994102478,
+      "kl": 0.0255126953125,
+      "learning_rate": 1e-06,
+      "loss": 0.0039,
+      "num_tokens": 96326386.0,
+      "reward": 1.1897321939468384,
+      "reward_std": 0.2706966996192932,
+      "rewards/code_format_reward/mean": 0.96875,
+      "rewards/code_format_reward/std": 0.17418713867664337,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2209821492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.4153723120689392,
+      "step": 228
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1864.0,
+      "completions/max_terminated_length": 1864.0,
+      "completions/mean_length": 316.2477722167969,
+      "completions/mean_terminated_length": 316.2477722167969,
+      "completions/min_length": 93.0,
+      "completions/min_terminated_length": 93.0,
+      "epoch": 0.9448169159360496,
+      "grad_norm": 0.29400742053985596,
+      "kl": 0.0277099609375,
+      "learning_rate": 1e-06,
+      "loss": 0.0429,
+      "num_tokens": 96749164.0,
+      "reward": 1.1227679252624512,
+      "reward_std": 0.2612314522266388,
+      "rewards/code_format_reward/mean": 0.9620535969734192,
+      "rewards/code_format_reward/std": 0.19128035008907318,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1607142835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.3676777780056,
+      "step": 229
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 690.0,
+      "completions/max_terminated_length": 690.0,
+      "completions/mean_length": 284.5535888671875,
+      "completions/mean_terminated_length": 284.5535888671875,
+      "completions/min_length": 99.0,
+      "completions/min_terminated_length": 99.0,
+      "epoch": 0.9489427539969056,
+      "grad_norm": 0.2488163560628891,
+      "kl": 0.029022216796875,
+      "learning_rate": 1e-06,
+      "loss": 0.008,
+      "num_tokens": 97146046.0,
+      "reward": 1.1696429252624512,
+      "reward_std": 0.17548497021198273,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1785714328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.3834212124347687,
+      "step": 230
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 949.0,
+      "completions/max_terminated_length": 949.0,
+      "completions/mean_length": 291.2165222167969,
+      "completions/mean_terminated_length": 291.2165222167969,
+      "completions/min_length": 87.0,
+      "completions/min_terminated_length": 87.0,
+      "epoch": 0.9530685920577617,
+      "grad_norm": 0.2761755883693695,
+      "kl": 0.02569580078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0327,
+      "num_tokens": 97538143.0,
+      "reward": 1.2165179252624512,
+      "reward_std": 0.2395087480545044,
+      "rewards/code_format_reward/mean": 0.9866071343421936,
+      "rewards/code_format_reward/std": 0.11507843434810638,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2299107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.42124560475349426,
+      "step": 231
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 718.0,
+      "completions/max_terminated_length": 718.0,
+      "completions/mean_length": 279.234375,
+      "completions/mean_terminated_length": 279.234375,
+      "completions/min_length": 83.0,
+      "completions/min_terminated_length": 83.0,
+      "epoch": 0.9571944301186178,
+      "grad_norm": 0.2993510961532593,
+      "kl": 0.0256805419921875,
+      "learning_rate": 1e-06,
+      "loss": 0.0048,
+      "num_tokens": 97929947.0,
+      "reward": 1.2165179252624512,
+      "reward_std": 0.2513192594051361,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915,
+      "step": 232
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 720.0,
+      "completions/max_terminated_length": 720.0,
+      "completions/mean_length": 290.8169860839844,
+      "completions/mean_terminated_length": 290.8169860839844,
+      "completions/min_length": 100.0,
+      "completions/min_terminated_length": 100.0,
+      "epoch": 0.9613202681794739,
+      "grad_norm": 0.2771032750606537,
+      "kl": 0.0251007080078125,
+      "learning_rate": 1e-06,
+      "loss": 0.0053,
+      "num_tokens": 98328032.0,
+      "reward": 1.1897321939468384,
+      "reward_std": 0.21872369945049286,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1986607164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.3994380831718445,
+      "step": 233
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1063.0,
+      "completions/max_terminated_length": 1063.0,
+      "completions/mean_length": 291.54241943359375,
+      "completions/mean_terminated_length": 291.54241943359375,
+      "completions/min_length": 91.0,
+      "completions/min_terminated_length": 91.0,
+      "epoch": 0.9654461062403301,
+      "grad_norm": 0.26633113622665405,
+      "kl": 0.02685546875,
+      "learning_rate": 1e-06,
+      "loss": 0.0203,
+      "num_tokens": 98730600.0,
+      "reward": 1.125,
+      "reward_std": 0.19034792482852936,
+      "rewards/code_format_reward/mean": 0.9933035969734192,
+      "rewards/code_format_reward/std": 0.08164843916893005,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133,
+      "step": 234
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1147.0,
+      "completions/max_terminated_length": 1147.0,
+      "completions/mean_length": 300.8326110839844,
+      "completions/mean_terminated_length": 300.8326110839844,
+      "completions/min_length": 113.0,
+      "completions/min_terminated_length": 113.0,
+      "epoch": 0.9695719443011862,
+      "grad_norm": 0.2591470181941986,
+      "kl": 0.0254974365234375,
+      "learning_rate": 1e-06,
+      "loss": 0.01,
+      "num_tokens": 99136343.0,
+      "reward": 1.2366071939468384,
+      "reward_std": 0.22157251834869385,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2388392835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.43206024169921875,
+      "step": 235
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 738.0,
+      "completions/max_terminated_length": 738.0,
+      "completions/mean_length": 285.0223388671875,
+      "completions/mean_terminated_length": 285.0223388671875,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 0.9736977823620423,
+      "grad_norm": 0.2993071973323822,
+      "kl": 0.03564453125,
+      "learning_rate": 1e-06,
+      "loss": 0.0079,
+      "num_tokens": 99538489.0,
+      "reward": 1.1473214626312256,
+      "reward_std": 0.218577578663826,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1517857164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.359214186668396,
+      "step": 236
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1059.0,
+      "completions/max_terminated_length": 1059.0,
+      "completions/mean_length": 304.1651916503906,
+      "completions/mean_terminated_length": 304.1651916503906,
+      "completions/min_length": 109.0,
+      "completions/min_terminated_length": 109.0,
+      "epoch": 0.9778236204228984,
+      "grad_norm": 0.25762394070625305,
+      "kl": 0.029205322265625,
+      "learning_rate": 1e-06,
+      "loss": 0.0112,
+      "num_tokens": 99931039.0,
+      "reward": 1.1830357313156128,
+      "reward_std": 0.22913821041584015,
+      "rewards/code_format_reward/mean": 0.9955357313156128,
+      "rewards/code_format_reward/std": 0.06674052774906158,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1875,
+      "rewards/curriculum_aware_reward_fn/std": 0.3907487094402313,
+      "step": 237
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 910.0,
+      "completions/max_terminated_length": 910.0,
+      "completions/mean_length": 295.3348388671875,
+      "completions/mean_terminated_length": 295.3348388671875,
+      "completions/min_length": 96.0,
+      "completions/min_terminated_length": 96.0,
+      "epoch": 0.9819494584837545,
+      "grad_norm": 0.3165225386619568,
+      "kl": 0.02850341796875,
+      "learning_rate": 1e-06,
+      "loss": 0.012,
+      "num_tokens": 100339303.0,
+      "reward": 1.2142857313156128,
+      "reward_std": 0.27128201723098755,
+      "rewards/code_format_reward/mean": 0.9910714030265808,
+      "rewards/code_format_reward/std": 0.09417349100112915,
+      "rewards/curriculum_aware_reward_fn/mean": 0.2232142835855484,
+      "rewards/curriculum_aware_reward_fn/std": 0.41686636209487915,
+      "step": 238
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 3986.0,
+      "completions/max_terminated_length": 3986.0,
+      "completions/mean_length": 316.00225830078125,
+      "completions/mean_terminated_length": 316.00225830078125,
+      "completions/min_length": 103.0,
+      "completions/min_terminated_length": 103.0,
+      "epoch": 0.9860752965446106,
+      "grad_norm": 0.25554707646369934,
+      "kl": 0.02362060546875,
+      "learning_rate": 1e-06,
+      "loss": -0.0092,
+      "num_tokens": 100748125.0,
+      "reward": 1.1741071939468384,
+      "reward_std": 0.1868884414434433,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1741071492433548,
+      "rewards/curriculum_aware_reward_fn/std": 0.37962549924850464,
+      "step": 239
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 827.0,
+      "completions/max_terminated_length": 827.0,
+      "completions/mean_length": 315.2321472167969,
+      "completions/mean_terminated_length": 315.2321472167969,
+      "completions/min_length": 111.0,
+      "completions/min_terminated_length": 111.0,
+      "epoch": 0.9902011346054668,
+      "grad_norm": 0.4484083354473114,
+      "kl": 0.05804443359375,
+      "learning_rate": 1e-06,
+      "loss": 0.005,
+      "num_tokens": 101162104.0,
+      "reward": 1.165178656578064,
+      "reward_std": 0.223335862159729,
+      "rewards/code_format_reward/mean": 0.9977678656578064,
+      "rewards/code_format_reward/std": 0.047245558351278305,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1674107164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.37375950813293457,
+      "step": 240
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 1037.0,
+      "completions/max_terminated_length": 1037.0,
+      "completions/mean_length": 303.9665222167969,
+      "completions/mean_terminated_length": 303.9665222167969,
+      "completions/min_length": 89.0,
+      "completions/min_terminated_length": 89.0,
+      "epoch": 0.9943269726663229,
+      "grad_norm": 0.21845290064811707,
+      "kl": 0.025726318359375,
+      "learning_rate": 1e-06,
+      "loss": 0.0091,
+      "num_tokens": 101574571.0,
+      "reward": 1.1316964626312256,
+      "reward_std": 0.15273067355155945,
+      "rewards/code_format_reward/mean": 1.0,
+      "rewards/code_format_reward/std": 0.0,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1316964328289032,
+      "rewards/curriculum_aware_reward_fn/std": 0.33853843808174133,
+      "step": 241
+    },
+    {
+      "clip_ratio/high_max": 0.0,
+      "clip_ratio/high_mean": 0.0,
+      "clip_ratio/low_mean": 0.0,
+      "clip_ratio/low_min": 0.0,
+      "clip_ratio/region_mean": 0.0,
+      "completions/clipped_ratio": 0.0,
+      "completions/max_length": 848.0,
+      "completions/max_terminated_length": 848.0,
+      "completions/mean_length": 299.19708251953125,
+      "completions/mean_terminated_length": 299.19708251953125,
+      "completions/min_length": 126.0,
+      "completions/min_terminated_length": 126.0,
+      "epoch": 0.998452810727179,
+      "grad_norm": 0.26789233088493347,
+      "kl": 0.0255889892578125,
+      "learning_rate": 1e-06,
+      "loss": 0.0175,
+      "num_tokens": 101989150.0,
+      "reward": 1.1674107313156128,
+      "reward_std": 0.21147961914539337,
+      "rewards/code_format_reward/mean": 0.984375,
+      "rewards/code_format_reward/std": 0.12415824085474014,
+      "rewards/curriculum_aware_reward_fn/mean": 0.1830357164144516,
+      "rewards/curriculum_aware_reward_fn/std": 0.387128084897995,
+      "step": 242
+    },
+    {
+      "epoch": 0.998452810727179,
+      "step": 242,
+      "total_flos": 0.0,
+      "train_loss": 0.012992730054614463,
+      "train_runtime": 16504.2621,
+      "train_samples_per_second": 0.94,
+      "train_steps_per_second": 0.015
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 242,
+  "num_input_tokens_seen": 101989150,
+  "num_train_epochs": 1,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 0.0,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}